From 811fd0cb9e4e17a49599300c4ff2e4c7716eaa2a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 29 Jan 2020 18:23:37 +0900 Subject: [PATCH] Dynamic analyzer (#755) * Removed generics in tokenizers * lowercaser * Added TokenizerExt * Introducing BoxedTokenizer * Introducing BoxXXXXX helper struct * Closes #762. * Introducing a TextAnalyzer --- CHANGELOG.md | 10 + examples/pre_tokenized_text.rs | 2 +- examples/stop_words.rs | 2 +- src/core/index.rs | 7 +- src/indexer/segment_writer.rs | 16 +- src/query/query_parser/query_parser.rs | 4 +- src/snippet/mod.rs | 36 ++-- src/tokenizer/alphanum_only.rs | 56 ++---- src/tokenizer/ascii_folding_filter.rs | 46 ++--- src/tokenizer/facet_tokenizer.rs | 11 +- src/tokenizer/lower_caser.rs | 63 +++--- src/tokenizer/mod.rs | 15 +- src/tokenizer/ngram_tokenizer.rs | 17 +- src/tokenizer/raw_tokenizer.rs | 8 +- src/tokenizer/remove_long.rs | 62 ++---- src/tokenizer/simple_tokenizer.rs | 11 +- src/tokenizer/stemmer.rs | 51 ++--- src/tokenizer/stop_word_filter.rs | 57 ++---- src/tokenizer/token_stream_chain.rs | 23 +-- src/tokenizer/tokenized_string.rs | 13 +- src/tokenizer/tokenizer.rs | 263 ++++++++++++++----------- src/tokenizer/tokenizer_manager.rs | 17 +- 22 files changed, 348 insertions(+), 442 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19af1cfc9..36d75f3e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +Tantivy 0.12.0 +====================== +- Removing static dispatch in tokenizers for simplicity. (#762) + +## How to update? + +Crates relying on custom tokenizer, or registering tokenizer in the manager will require some +minor changes. Check https://github.com/tantivy-search/tantivy/blob/master/examples/custom_tokenizer.rs +to check for some code sample. + Tantivy 0.11.3 ======================= - Fixed DateTime as a fast field (#735) diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs index 5cf309ef1..57d867b1b 100644 --- a/examples/pre_tokenized_text.rs +++ b/examples/pre_tokenized_text.rs @@ -9,7 +9,7 @@ // - import tokenized text straight from json, // - perform a search on documents with pre-tokenized text -use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer}; +use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer}; use tantivy::collector::{Count, TopDocs}; use tantivy::query::TermQuery; diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 091112984..ac7694122 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> { // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus - let tokenizer = SimpleTokenizer + let tokenizer = TextAnalyzer::from(SimpleTokenizer) .filter(LowerCaser) .filter(StopWordFilter::remove(vec![ "the".to_string(), diff --git a/src/core/index.rs b/src/core/index.rs index 28d49a1dd..fe4ce4b72 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -20,8 +20,7 @@ use crate::reader::IndexReaderBuilder; use crate::schema::Field; use crate::schema::FieldType; use crate::schema::Schema; -use crate::tokenizer::BoxedTokenizer; -use crate::tokenizer::TokenizerManager; +use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::IndexWriter; use crate::Result; use num_cpus; @@ -173,11 +172,11 @@ impl Index { } /// Helper to access the tokenizer associated to a specific field. - pub fn tokenizer_for_field(&self, field: Field) -> Result { + pub fn tokenizer_for_field(&self, field: Field) -> Result { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); let tokenizer_manager: &TokenizerManager = self.tokenizers(); - let tokenizer_name_opt: Option = match field_type { + let tokenizer_name_opt: Option = match field_type { FieldType::Str(text_options) => text_options .get_indexing_options() .map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 8ed1025ba..40515bfb0 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -11,10 +11,9 @@ use crate::schema::Schema; use crate::schema::Term; use crate::schema::Value; use crate::schema::{Field, FieldEntry}; -use crate::tokenizer::BoxedTokenizer; -use crate::tokenizer::FacetTokenizer; -use crate::tokenizer::PreTokenizedStream; -use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer}; +use crate::tokenizer::{BoxTokenStream, PreTokenizedStream}; +use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; +use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::DocId; use crate::Opstamp; use crate::Result; @@ -50,7 +49,7 @@ pub struct SegmentWriter { fast_field_writers: FastFieldsWriter, fieldnorms_writer: FieldNormsWriter, doc_opstamps: Vec, - tokenizers: Vec>, + tokenizers: Vec>, } impl SegmentWriter { @@ -159,7 +158,7 @@ impl SegmentWriter { } } FieldType::Str(_) => { - let mut token_streams: Vec> = vec![]; + let mut token_streams: Vec = vec![]; let mut offsets = vec![]; let mut total_offset = 0; @@ -172,7 +171,7 @@ impl SegmentWriter { } token_streams - .push(Box::new(PreTokenizedStream::from(tok_str.clone()))); + .push(PreTokenizedStream::from(tok_str.clone()).into()); } Value::Str(ref text) => { if let Some(ref mut tokenizer) = @@ -191,8 +190,7 @@ impl SegmentWriter { let num_tokens = if token_streams.is_empty() { 0 } else { - let mut token_stream: Box = - Box::new(TokenStreamChain::new(offsets, token_streams)); + let mut token_stream = TokenStreamChain::new(offsets, token_streams); self.multifield_postings .index_text(doc_id, field, &mut token_stream) }; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index b9b6d2462..bd9dc0869 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -533,7 +533,7 @@ mod test { use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT}; use crate::tokenizer::{ - LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager, + LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager, }; use crate::Index; use matches::assert_matches; @@ -563,7 +563,7 @@ mod test { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "en_with_stop_words", - SimpleTokenizer + TextAnalyzer::from(SimpleTokenizer) .filter(LowerCaser) .filter(StopWordFilter::remove(vec!["the".to_string()])), ); diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8c3f62636..ce50b1de1 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,8 +1,7 @@ use crate::query::Query; use crate::schema::Field; use crate::schema::Value; -use crate::tokenizer::BoxedTokenizer; -use crate::tokenizer::{Token, TokenStream}; +use crate::tokenizer::{TextAnalyzer, Token}; use crate::Document; use crate::Result; use crate::Searcher; @@ -142,7 +141,7 @@ impl Snippet { /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. fn search_fragments<'a>( - tokenizer: &BoxedTokenizer, + tokenizer: &TextAnalyzer, text: &'a str, terms: &BTreeMap, max_num_chars: usize, @@ -251,7 +250,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) /// ``` pub struct SnippetGenerator { terms_text: BTreeMap, - tokenizer: BoxedTokenizer, + tokenizer: TextAnalyzer, field: Field, max_num_chars: usize, } @@ -347,12 +346,11 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet() { - let boxed_tokenizer = SimpleTokenizer.into(); let terms = btreemap! { String::from("rust") => 1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100); + let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100); assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -374,13 +372,12 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_scored_fragment() { - let boxed_tokenizer = SimpleTokenizer.into(); { let terms = btreemap! { String::from("rust") =>1.0f32, String::from("language") => 0.9f32 }; - let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20); + let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); { let first = &fragments[0]; assert_eq!(first.score, 1.0); @@ -389,13 +386,12 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); assert_eq!(snippet.to_html(), "Rust is a systems") } - let boxed_tokenizer = SimpleTokenizer.into(); { let terms = btreemap! { String::from("rust") =>0.9f32, String::from("language") => 1.0f32 }; - let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20); + let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); //assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -409,14 +405,12 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_in_second_fragment() { - let boxed_tokenizer = SimpleTokenizer.into(); - let text = "a b c d e f g"; let mut terms = BTreeMap::new(); terms.insert(String::from("c"), 1.0); - let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); + let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 1); { @@ -433,14 +427,12 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_with_term_at_the_end_of_fragment() { - let boxed_tokenizer = SimpleTokenizer.into(); - let text = "a b c d e f f g"; let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); - let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); + let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 2); { @@ -457,15 +449,13 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_with_second_fragment_has_the_highest_score() { - let boxed_tokenizer = SimpleTokenizer.into(); - let text = "a b c d e f g"; let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); terms.insert(String::from("a"), 0.9); - let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7); + let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7); assert_eq!(fragments.len(), 2); { @@ -482,14 +472,12 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_with_term_not_in_text() { - let boxed_tokenizer = SimpleTokenizer.into(); - let text = "a b c d"; let mut terms = BTreeMap::new(); terms.insert(String::from("z"), 1.0); - let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); + let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 0); @@ -500,12 +488,10 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_with_no_terms() { - let boxed_tokenizer = SimpleTokenizer.into(); - let text = "a b c d"; let terms = BTreeMap::new(); - let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); + let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(&fragments[..], &text); diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index 5d15e45e5..9d3436535 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = RawTokenizer +//! let tokenizer = TextAnalyzer::from(RawTokenizer) //! .filter(AlphaNumOnlyFilter); //! //! let mut stream = tokenizer.token_stream("hello there"); @@ -10,7 +10,7 @@ //! // contains a space //! assert!(stream.next().is_none()); //! -//! let tokenizer = SimpleTokenizer +//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) //! .filter(AlphaNumOnlyFilter); //! //! let mut stream = tokenizer.token_stream("hello there 💣"); @@ -19,56 +19,30 @@ //! // the "emoji" is dropped because its not an alphanum //! assert!(stream.next().is_none()); //! ``` -use super::{Token, TokenFilter, TokenStream}; +use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; /// `TokenFilter` that removes all tokens that contain non /// ascii alphanumeric characters. #[derive(Clone)] pub struct AlphaNumOnlyFilter; -pub struct AlphaNumOnlyFilterStream -where - TailTokenStream: TokenStream, -{ - tail: TailTokenStream, +pub struct AlphaNumOnlyFilterStream<'a> { + tail: BoxTokenStream<'a>, } -impl AlphaNumOnlyFilterStream -where - TailTokenStream: TokenStream, -{ +impl<'a> AlphaNumOnlyFilterStream<'a> { fn predicate(&self, token: &Token) -> bool { token.text.chars().all(|c| c.is_ascii_alphanumeric()) } +} - fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream { - AlphaNumOnlyFilterStream { tail } +impl TokenFilter for AlphaNumOnlyFilter { + fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { + BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream }) } } -impl TokenFilter for AlphaNumOnlyFilter -where - TailTokenStream: TokenStream, -{ - type ResultTokenStream = AlphaNumOnlyFilterStream; - - fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { - AlphaNumOnlyFilterStream::wrap(token_stream) - } -} - -impl TokenStream for AlphaNumOnlyFilterStream -where - TailTokenStream: TokenStream, -{ - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() - } - +impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> { fn advance(&mut self) -> bool { while self.tail.advance() { if self.predicate(self.tail.token()) { @@ -78,4 +52,12 @@ where false } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } } diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index 7193911ac..1eb6cbb89 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -1,4 +1,4 @@ -use super::{Token, TokenFilter, TokenStream}; +use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; use std::mem; /// This class converts alphabetic, numeric, and symbolic Unicode characters @@ -7,26 +7,21 @@ use std::mem; #[derive(Clone)] pub struct AsciiFoldingFilter; -impl TokenFilter for AsciiFoldingFilter -where - TailTokenStream: TokenStream, -{ - type ResultTokenStream = AsciiFoldingFilterTokenStream; - - fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { - AsciiFoldingFilterTokenStream::wrap(token_stream) +impl TokenFilter for AsciiFoldingFilter { + fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { + From::from(AsciiFoldingFilterTokenStream { + tail: token_stream, + buffer: String::with_capacity(100), + }) } } -pub struct AsciiFoldingFilterTokenStream { +pub struct AsciiFoldingFilterTokenStream<'a> { buffer: String, - tail: TailTokenStream, + tail: BoxTokenStream<'a>, } -impl TokenStream for AsciiFoldingFilterTokenStream -where - TailTokenStream: TokenStream, -{ +impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; @@ -48,18 +43,6 @@ where } } -impl AsciiFoldingFilterTokenStream -where - TailTokenStream: TokenStream, -{ - fn wrap(tail: TailTokenStream) -> AsciiFoldingFilterTokenStream { - AsciiFoldingFilterTokenStream { - tail, - buffer: String::with_capacity(100), - } - } -} - // Returns a string that represents the ascii folded version of // the character. If the `char` does not require ascii folding // (e.g. simple ASCII chars like `A`) or if the `char` @@ -1561,8 +1544,7 @@ mod tests { use crate::tokenizer::AsciiFoldingFilter; use crate::tokenizer::RawTokenizer; use crate::tokenizer::SimpleTokenizer; - use crate::tokenizer::TokenStream; - use crate::tokenizer::Tokenizer; + use crate::tokenizer::TextAnalyzer; use std::iter; #[test] @@ -1579,7 +1561,7 @@ mod tests { fn folding_helper(text: &str) -> Vec { let mut tokens = Vec::new(); - SimpleTokenizer + TextAnalyzer::from(SimpleTokenizer) .filter(AsciiFoldingFilter) .token_stream(text) .process(&mut |token| { @@ -1589,7 +1571,9 @@ mod tests { } fn folding_using_raw_tokenizer_helper(text: &str) -> String { - let mut token_stream = RawTokenizer.filter(AsciiFoldingFilter).token_stream(text); + let mut token_stream = TextAnalyzer::from(RawTokenizer) + .filter(AsciiFoldingFilter) + .token_stream(text); token_stream.advance(); token_stream.token().text.clone() } diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 24f26589e..963e4e30c 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -1,4 +1,4 @@ -use super::{Token, TokenStream, Tokenizer}; +use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; use crate::schema::FACET_SEP_BYTE; /// The `FacetTokenizer` process a `Facet` binary representation @@ -25,15 +25,14 @@ pub struct FacetTokenStream<'a> { token: Token, } -impl<'a> Tokenizer<'a> for FacetTokenizer { - type TokenStreamImpl = FacetTokenStream<'a>; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { +impl Tokenizer for FacetTokenizer { + fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { FacetTokenStream { text, state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. token: Token::default(), } + .into() } } @@ -84,7 +83,7 @@ mod tests { use super::FacetTokenizer; use crate::schema::Facet; - use crate::tokenizer::{Token, TokenStream, Tokenizer}; + use crate::tokenizer::{Token, Tokenizer}; #[test] fn test_facet_tokenizer() { diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index e0895eaf9..830be793d 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,24 +1,23 @@ use super::{Token, TokenFilter, TokenStream}; +use crate::tokenizer::BoxTokenStream; use std::mem; +impl TokenFilter for LowerCaser { + fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { + BoxTokenStream::from(LowerCaserTokenStream { + tail: token_stream, + buffer: String::with_capacity(100), + }) + } +} + /// Token filter that lowercase terms. #[derive(Clone)] pub struct LowerCaser; -impl TokenFilter for LowerCaser -where - TailTokenStream: TokenStream, -{ - type ResultTokenStream = LowerCaserTokenStream; - - fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { - LowerCaserTokenStream::wrap(token_stream) - } -} - -pub struct LowerCaserTokenStream { +pub struct LowerCaserTokenStream<'a> { buffer: String, - tail: TailTokenStream, + tail: BoxTokenStream<'a>, } // writes a lowercased version of text into output. @@ -31,18 +30,7 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) { } } -impl TokenStream for LowerCaserTokenStream -where - TailTokenStream: TokenStream, -{ - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() - } - +impl<'a> TokenStream for LowerCaserTokenStream<'a> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; @@ -56,26 +44,19 @@ where } true } -} -impl LowerCaserTokenStream -where - TailTokenStream: TokenStream, -{ - fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { - LowerCaserTokenStream { - tail, - buffer: String::with_capacity(100), - } + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() } } #[cfg(test)] mod tests { - use crate::tokenizer::LowerCaser; - use crate::tokenizer::SimpleTokenizer; - use crate::tokenizer::TokenStream; - use crate::tokenizer::Tokenizer; + use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer}; #[test] fn test_to_lower_case() { @@ -87,7 +68,9 @@ mod tests { fn lowercase_helper(text: &str) -> Vec { let mut tokens = vec![]; - let mut token_stream = SimpleTokenizer.filter(LowerCaser).token_stream(text); + let mut token_stream = TextAnalyzer::from(SimpleTokenizer) + .filter(LowerCaser) + .token_stream(text); while token_stream.advance() { let token_text = token_stream.token().text.clone(); tokens.push(token_text); diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index d0aaaab9a..7b5772393 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -64,7 +64,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let en_stem = SimpleTokenizer +//! let en_stem = TextAnalyzer::from(SimpleTokenizer) //! .filter(RemoveLongFilter::limit(40)) //! .filter(LowerCaser) //! .filter(Stemmer::new(Language::English)); @@ -109,7 +109,7 @@ //! let index = Index::create_in_ram(schema); //! //! // We need to register our tokenizer : -//! let custom_en_tokenizer = SimpleTokenizer +//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer) //! .filter(RemoveLongFilter::limit(40)) //! .filter(LowerCaser); //! index @@ -143,10 +143,11 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::{Language, Stemmer}; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::BoxedTokenizer; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; -pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; +pub use self::tokenizer::{ + BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer, +}; pub use self::tokenizer_manager::TokenizerManager; @@ -160,9 +161,9 @@ pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4; #[cfg(test)] pub mod tests { use super::{ - Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, Tokenizer, - TokenizerManager, + Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager, }; + use crate::tokenizer::TextAnalyzer; /// This is a function that can be used in tests and doc tests /// to assert a token's correctness. @@ -229,7 +230,7 @@ pub mod tests { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "el_stem", - SimpleTokenizer + TextAnalyzer::from(SimpleTokenizer) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(Stemmer::new(Language::Greek)), diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index 50aeca9a6..8bd82b79d 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -1,4 +1,5 @@ use super::{Token, TokenStream, Tokenizer}; +use crate::tokenizer::BoxTokenStream; /// Tokenize the text by splitting words into n-grams of the given size(s) /// @@ -129,11 +130,9 @@ pub struct NgramTokenStream<'a> { token: Token, } -impl<'a> Tokenizer<'a> for NgramTokenizer { - type TokenStreamImpl = NgramTokenStream<'a>; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { - NgramTokenStream { +impl Tokenizer for NgramTokenizer { + fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + From::from(NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( CodepointFrontiers::for_str(text), self.min_gram, @@ -142,7 +141,7 @@ impl<'a> Tokenizer<'a> for NgramTokenizer { prefix_only: self.prefix_only, text, token: Token::default(), - } + }) } } @@ -308,10 +307,10 @@ mod tests { use super::NgramTokenizer; use super::StutteringIterator; use crate::tokenizer::tests::assert_token; - use crate::tokenizer::tokenizer::{TokenStream, Tokenizer}; - use crate::tokenizer::Token; + use crate::tokenizer::tokenizer::Tokenizer; + use crate::tokenizer::{BoxTokenStream, Token}; - fn test_helper(mut tokenizer: T) -> Vec { + fn test_helper(mut tokenizer: BoxTokenStream) -> Vec { let mut tokens: Vec = vec![]; tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); tokens diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 8827d537e..7de93190b 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,4 +1,5 @@ use super::{Token, TokenStream, Tokenizer}; +use crate::tokenizer::BoxTokenStream; /// For each value of the field, emit a single unprocessed token. #[derive(Clone)] @@ -9,10 +10,8 @@ pub struct RawTokenStream { has_token: bool, } -impl<'a> Tokenizer<'a> for RawTokenizer { - type TokenStreamImpl = RawTokenStream; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { +impl Tokenizer for RawTokenizer { + fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { let token = Token { offset_from: 0, offset_to: text.len(), @@ -24,6 +23,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer { token, has_token: true, } + .into() } } diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index 31b824f26..fb510df7c 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = SimpleTokenizer +//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) //! .filter(RemoveLongFilter::limit(5)); //! //! let mut stream = tokenizer.token_stream("toolong nice"); @@ -13,6 +13,7 @@ //! ``` //! use super::{Token, TokenFilter, TokenStream}; +use crate::tokenizer::BoxTokenStream; /// `RemoveLongFilter` removes tokens that are longer /// than a given number of bytes (in UTF-8 representation). @@ -31,56 +32,27 @@ impl RemoveLongFilter { } } -impl RemoveLongFilterStream -where - TailTokenStream: TokenStream, -{ +impl<'a> RemoveLongFilterStream<'a> { fn predicate(&self, token: &Token) -> bool { token.text.len() < self.token_length_limit } +} - fn wrap( - token_length_limit: usize, - tail: TailTokenStream, - ) -> RemoveLongFilterStream { - RemoveLongFilterStream { - token_length_limit, - tail, - } +impl TokenFilter for RemoveLongFilter { + fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { + BoxTokenStream::from(RemoveLongFilterStream { + token_length_limit: self.length_limit, + tail: token_stream, + }) } } -impl TokenFilter for RemoveLongFilter -where - TailTokenStream: TokenStream, -{ - type ResultTokenStream = RemoveLongFilterStream; - - fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { - RemoveLongFilterStream::wrap(self.length_limit, token_stream) - } -} - -pub struct RemoveLongFilterStream -where - TailTokenStream: TokenStream, -{ +pub struct RemoveLongFilterStream<'a> { token_length_limit: usize, - tail: TailTokenStream, + tail: BoxTokenStream<'a>, } -impl TokenStream for RemoveLongFilterStream -where - TailTokenStream: TokenStream, -{ - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() - } - +impl<'a> TokenStream for RemoveLongFilterStream<'a> { fn advance(&mut self) -> bool { while self.tail.advance() { if self.predicate(self.tail.token()) { @@ -89,4 +61,12 @@ where } false } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } } diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index 53422a290..7a64071b8 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -1,3 +1,4 @@ +use super::BoxTokenStream; use super::{Token, TokenStream, Tokenizer}; use std::str::CharIndices; @@ -11,15 +12,13 @@ pub struct SimpleTokenStream<'a> { token: Token, } -impl<'a> Tokenizer<'a> for SimpleTokenizer { - type TokenStreamImpl = SimpleTokenStream<'a>; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { - SimpleTokenStream { +impl Tokenizer for SimpleTokenizer { + fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + BoxTokenStream::from(SimpleTokenStream { text, chars: text.char_indices(), token: Token::default(), - } + }) } } diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index ec2547249..afa9a5249 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -1,4 +1,5 @@ use super::{Token, TokenFilter, TokenStream}; +use crate::tokenizer::BoxTokenStream; use rust_stemmers::{self, Algorithm}; /// Available stemmer languages. @@ -75,38 +76,22 @@ impl Default for Stemmer { } } -impl TokenFilter for Stemmer -where - TailTokenStream: TokenStream, -{ - type ResultTokenStream = StemmerTokenStream; - - fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { +impl TokenFilter for Stemmer { + fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); - StemmerTokenStream::wrap(inner_stemmer, token_stream) + BoxTokenStream::from(StemmerTokenStream { + tail: token_stream, + stemmer: inner_stemmer, + }) } } -pub struct StemmerTokenStream -where - TailTokenStream: TokenStream, -{ - tail: TailTokenStream, +pub struct StemmerTokenStream<'a> { + tail: BoxTokenStream<'a>, stemmer: rust_stemmers::Stemmer, } -impl TokenStream for StemmerTokenStream -where - TailTokenStream: TokenStream, -{ - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() - } - +impl<'a> TokenStream for StemmerTokenStream<'a> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; @@ -117,16 +102,12 @@ where self.token_mut().text.push_str(&stemmed_str); true } -} -impl StemmerTokenStream -where - TailTokenStream: TokenStream, -{ - fn wrap( - stemmer: rust_stemmers::Stemmer, - tail: TailTokenStream, - ) -> StemmerTokenStream { - StemmerTokenStream { tail, stemmer } + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() } } diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index e554a330e..b2551eb48 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = SimpleTokenizer +//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) //! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])); //! //! let mut stream = tokenizer.token_stream("the fox is crafty"); @@ -11,6 +11,7 @@ //! assert!(stream.next().is_none()); //! ``` use super::{Token, TokenFilter, TokenStream}; +use crate::tokenizer::BoxTokenStream; use fnv::FnvHasher; use std::collections::HashSet; use std::hash::BuildHasherDefault; @@ -48,53 +49,27 @@ impl StopWordFilter { } } -pub struct StopWordFilterStream -where - TailTokenStream: TokenStream, -{ +pub struct StopWordFilterStream<'a> { words: StopWordHashSet, - tail: TailTokenStream, + tail: BoxTokenStream<'a>, } -impl TokenFilter for StopWordFilter -where - TailTokenStream: TokenStream, -{ - type ResultTokenStream = StopWordFilterStream; - - fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { - StopWordFilterStream::wrap(self.words.clone(), token_stream) +impl TokenFilter for StopWordFilter { + fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { + BoxTokenStream::from(StopWordFilterStream { + words: self.words.clone(), + tail: token_stream, + }) } } -impl StopWordFilterStream -where - TailTokenStream: TokenStream, -{ +impl<'a> StopWordFilterStream<'a> { fn predicate(&self, token: &Token) -> bool { !self.words.contains(&token.text) } - - fn wrap( - words: StopWordHashSet, - tail: TailTokenStream, - ) -> StopWordFilterStream { - StopWordFilterStream { words, tail } - } } -impl TokenStream for StopWordFilterStream -where - TailTokenStream: TokenStream, -{ - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() - } - +impl<'a> TokenStream for StopWordFilterStream<'a> { fn advance(&mut self) -> bool { while self.tail.advance() { if self.predicate(self.tail.token()) { @@ -103,6 +78,14 @@ where } false } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } } impl Default for StopWordFilter { diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index f6f6e66b2..cf8ef0206 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -1,23 +1,21 @@ -use crate::tokenizer::{Token, TokenStream}; +use crate::tokenizer::{BoxTokenStream, Token, TokenStream}; +use std::ops::DerefMut; const POSITION_GAP: usize = 2; -pub(crate) struct TokenStreamChain { +pub(crate) struct TokenStreamChain<'a> { offsets: Vec, - token_streams: Vec, + token_streams: Vec>, position_shift: usize, stream_idx: usize, token: Token, } -impl<'a, TTokenStream> TokenStreamChain -where - TTokenStream: TokenStream, -{ +impl<'a> TokenStreamChain<'a> { pub fn new( offsets: Vec, - token_streams: Vec, - ) -> TokenStreamChain { + token_streams: Vec>, + ) -> TokenStreamChain<'a> { TokenStreamChain { offsets, stream_idx: 0, @@ -28,13 +26,10 @@ where } } -impl<'a, TTokenStream> TokenStream for TokenStreamChain -where - TTokenStream: TokenStream, -{ +impl<'a> TokenStream for TokenStreamChain<'a> { fn advance(&mut self) -> bool { while self.stream_idx < self.token_streams.len() { - let token_stream = &mut self.token_streams[self.stream_idx]; + let token_stream = self.token_streams[self.stream_idx].deref_mut(); if token_stream.advance() { let token = token_stream.token(); let offset_offset = self.offsets[self.stream_idx]; diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs index 50da55e40..609bf2f49 100644 --- a/src/tokenizer/tokenized_string.rs +++ b/src/tokenizer/tokenized_string.rs @@ -1,4 +1,4 @@ -use crate::tokenizer::{Token, TokenStream, TokenStreamChain}; +use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain}; use std::cmp::Ordering; /// Struct representing pre-tokenized text @@ -41,9 +41,9 @@ impl PreTokenizedStream { /// Creates a TokenStream from PreTokenizedString array pub fn chain_tokenized_strings<'a>( tok_strings: &'a [&'a PreTokenizedString], - ) -> Box { + ) -> BoxTokenStream { if tok_strings.len() == 1 { - Box::new(PreTokenizedStream::from((*tok_strings[0]).clone())) + PreTokenizedStream::from((*tok_strings[0]).clone()).into() } else { let mut offsets = vec![]; let mut total_offset = 0; @@ -53,11 +53,12 @@ impl PreTokenizedStream { total_offset += last_token.offset_to; } } - let token_streams: Vec<_> = tok_strings + // TODO remove the string cloning. + let token_streams: Vec> = tok_strings .iter() - .map(|tok_string| PreTokenizedStream::from((*tok_string).clone())) + .map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into()) .collect(); - Box::new(TokenStreamChain::new(offsets, token_streams)) + TokenStreamChain::new(offsets, token_streams).into() } } } diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 78a0eaf4b..0a6f61758 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -2,6 +2,7 @@ use crate::tokenizer::TokenStreamChain; /// The tokenizer module contains all of the tools used to process /// text in `tantivy`. use std::borrow::{Borrow, BorrowMut}; +use std::ops::{Deref, DerefMut}; /// Token #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] @@ -33,20 +34,31 @@ impl Default for Token { } } -/// `Tokenizer` are in charge of splitting text into a stream of token -/// before indexing. +/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`. /// -/// See the [module documentation](./index.html) for more detail. -/// -/// # Warning -/// -/// This API may change to use associated types. -pub trait Tokenizer<'a>: Sized + Clone { - /// Type associated to the resulting tokenstream tokenstream. - type TokenStreamImpl: TokenStream; +/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially. +pub struct TextAnalyzer { + tokenizer: Box, + token_filters: Vec, +} - /// Creates a token stream for a given `str`. - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl; +impl From for TextAnalyzer { + fn from(tokenizer: T) -> Self { + TextAnalyzer::new(tokenizer, Vec::new()) + } +} + +impl TextAnalyzer { + /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`. + /// + /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using + /// `TextAnalyzer::from(tokenizer)`. + pub fn new(tokenizer: T, token_filters: Vec) -> TextAnalyzer { + TextAnalyzer { + tokenizer: Box::new(tokenizer), + token_filters, + } + } /// Appends a token filter to the current tokenizer. /// @@ -58,90 +70,26 @@ pub trait Tokenizer<'a>: Sized + Clone { /// ```rust /// use tantivy::tokenizer::*; /// - /// let en_stem = SimpleTokenizer + /// let en_stem = TextAnalyzer::from(SimpleTokenizer) /// .filter(RemoveLongFilter::limit(40)) /// .filter(LowerCaser) /// .filter(Stemmer::default()); /// ``` /// - fn filter(self, new_filter: NewFilter) -> ChainTokenizer - where - NewFilter: TokenFilter<>::TokenStreamImpl>, - { - ChainTokenizer { - head: new_filter, - tail: self, - } - } -} - -/// A boxed tokenizer -trait BoxedTokenizerTrait: Send + Sync { - /// Tokenize a `&str` - fn token_stream<'a>(&self, text: &'a str) -> Box; - - /// Tokenize an array`&str` - /// - /// The resulting `TokenStream` is equivalent to what would be obtained if the &str were - /// one concatenated `&str`, with an artificial position gap of `2` between the different fields - /// to prevent accidental `PhraseQuery` to match accross two terms. - fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box; - - /// Return a boxed clone of the tokenizer - fn boxed_clone(&self) -> BoxedTokenizer; -} - -/// A boxed tokenizer -pub struct BoxedTokenizer(Box); - -impl From for BoxedTokenizer -where - T: 'static + Send + Sync + for<'a> Tokenizer<'a>, -{ - fn from(tokenizer: T) -> BoxedTokenizer { - BoxedTokenizer(Box::new(BoxableTokenizer(tokenizer))) - } -} - -impl BoxedTokenizer { - /// Tokenize a `&str` - pub fn token_stream<'a>(&self, text: &'a str) -> Box { - self.0.token_stream(text) + pub fn filter>(mut self, token_filter: F) -> Self { + self.token_filters.push(token_filter.into()); + self } /// Tokenize an array`&str` /// - /// The resulting `TokenStream` is equivalent to what would be obtained if the &str were + /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were /// one concatenated `&str`, with an artificial position gap of `2` between the different fields /// to prevent accidental `PhraseQuery` to match accross two terms. - pub fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box { - self.0.token_stream_texts(texts) - } -} - -impl Clone for BoxedTokenizer { - fn clone(&self) -> BoxedTokenizer { - self.0.boxed_clone() - } -} - -#[derive(Clone)] -struct BoxableTokenizer(A) -where - A: for<'a> Tokenizer<'a> + Send + Sync; - -impl BoxedTokenizerTrait for BoxableTokenizer -where - A: 'static + Send + Sync + for<'a> Tokenizer<'a>, -{ - fn token_stream<'a>(&self, text: &'a str) -> Box { - Box::new(self.0.token_stream(text)) - } - - fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box { + pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> { assert!(!texts.is_empty()); if texts.len() == 1 { - Box::new(self.0.token_stream(texts[0])) + self.token_stream(texts[0]) } else { let mut offsets = vec![]; let mut total_offset = 0; @@ -149,34 +97,124 @@ where offsets.push(total_offset); total_offset += text.len(); } - let token_streams: Vec<_> = - texts.iter().map(|text| self.0.token_stream(text)).collect(); - Box::new(TokenStreamChain::new(offsets, token_streams)) + let token_streams: Vec> = texts + .iter() + .cloned() + .map(|text| self.token_stream(text)) + .collect(); + From::from(TokenStreamChain::new(offsets, token_streams)) } } - fn boxed_clone(&self) -> BoxedTokenizer { - self.0.clone().into() + /// Creates a token stream for a given `str`. + pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + let mut token_stream = self.tokenizer.token_stream(text); + for token_filter in &self.token_filters { + token_stream = token_filter.transform(token_stream); + } + token_stream } } -impl<'b> TokenStream for Box { +impl Clone for TextAnalyzer { + fn clone(&self) -> Self { + TextAnalyzer { + tokenizer: self.tokenizer.box_clone(), + token_filters: self + .token_filters + .iter() + .map(|token_filter| token_filter.box_clone()) + .collect(), + } + } +} + +/// `Tokenizer` are in charge of splitting text into a stream of token +/// before indexing. +/// +/// See the [module documentation](./index.html) for more detail. +/// +/// # Warning +/// +/// This API may change to use associated types. +pub trait Tokenizer: 'static + Send + Sync + TokenizerClone { + /// Creates a token stream for a given `str`. + fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>; +} + +pub trait TokenizerClone { + fn box_clone(&self) -> Box; +} + +impl TokenizerClone for T { + fn box_clone(&self) -> Box { + Box::new(self.clone()) + } +} + +impl<'a> TokenStream for Box { fn advance(&mut self) -> bool { let token_stream: &mut dyn TokenStream = self.borrow_mut(); token_stream.advance() } - fn token(&self) -> &Token { - let token_stream: &dyn TokenStream = self.borrow(); + fn token<'b>(&'b self) -> &'b Token { + let token_stream: &'b (dyn TokenStream + 'a) = self.borrow(); token_stream.token() } - fn token_mut(&mut self) -> &mut Token { - let token_stream: &mut dyn TokenStream = self.borrow_mut(); + fn token_mut<'b>(&'b mut self) -> &'b mut Token { + let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut(); token_stream.token_mut() } } +/// Simple wrapper of `Box`. +/// +/// See `TokenStream` for more information. +pub struct BoxTokenStream<'a>(Box); + +impl<'a, T> From for BoxTokenStream<'a> +where + T: TokenStream + 'a, +{ + fn from(token_stream: T) -> BoxTokenStream<'a> { + BoxTokenStream(Box::new(token_stream)) + } +} + +impl<'a> Deref for BoxTokenStream<'a> { + type Target = dyn TokenStream + 'a; + + fn deref(&self) -> &Self::Target { + &*self.0 + } +} +impl<'a> DerefMut for BoxTokenStream<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut *self.0 + } +} + +/// Simple wrapper of `Box`. +/// +/// See `TokenStream` for more information. +pub struct BoxTokenFilter(Box); + +impl Deref for BoxTokenFilter { + type Target = dyn TokenFilter; + + fn deref(&self) -> &dyn TokenFilter { + &*self.0 + } +} + +impl From for BoxTokenFilter { + fn from(tokenizer: T) -> BoxTokenFilter { + BoxTokenFilter(Box::new(tokenizer)) + } +} + /// `TokenStream` is the result of the tokenization. /// /// It consists consumable stream of `Token`s. @@ -186,7 +224,7 @@ impl<'b> TokenStream for Box { /// ``` /// use tantivy::tokenizer::*; /// -/// let tokenizer = SimpleTokenizer +/// let tokenizer = TextAnalyzer::from(SimpleTokenizer) /// .filter(RemoveLongFilter::limit(40)) /// .filter(LowerCaser); /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer"); @@ -225,7 +263,7 @@ pub trait TokenStream { /// ``` /// use tantivy::tokenizer::*; /// - /// let tokenizer = SimpleTokenizer + /// let tokenizer = TextAnalyzer::from(SimpleTokenizer) /// .filter(RemoveLongFilter::limit(40)) /// .filter(LowerCaser); /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer"); @@ -243,6 +281,8 @@ pub trait TokenStream { /// Helper function to consume the entire `TokenStream` /// and push the tokens to a sink function. + /// + /// Remove this. fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 { let mut num_tokens_pushed = 0u32; while self.advance() { @@ -253,33 +293,20 @@ pub trait TokenStream { } } -#[derive(Clone)] -pub struct ChainTokenizer { - head: HeadTokenFilterFactory, - tail: TailTokenizer, -} - -impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a> - for ChainTokenizer -where - HeadTokenFilterFactory: TokenFilter, - TailTokenizer: Tokenizer<'a>, -{ - type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { - let tail_token_stream = self.tail.token_stream(text); - self.head.transform(tail_token_stream) - } +pub trait TokenFilterClone { + fn box_clone(&self) -> BoxTokenFilter; } /// Trait for the pluggable components of `Tokenizer`s. -pub trait TokenFilter: Clone { - /// The resulting `TokenStream` type. - type ResultTokenStream: TokenStream; - +pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone { /// Wraps a token stream and returns the modified one. - fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream; + fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>; +} + +impl TokenFilterClone for T { + fn box_clone(&self) -> BoxTokenFilter { + BoxTokenFilter::from(self.clone()) + } } #[cfg(test)] diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index 6ac46beea..89cf2407a 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -1,11 +1,10 @@ use crate::tokenizer::stemmer::Language; -use crate::tokenizer::BoxedTokenizer; +use crate::tokenizer::tokenizer::TextAnalyzer; use crate::tokenizer::LowerCaser; use crate::tokenizer::RawTokenizer; use crate::tokenizer::RemoveLongFilter; use crate::tokenizer::SimpleTokenizer; use crate::tokenizer::Stemmer; -use crate::tokenizer::Tokenizer; use std::collections::HashMap; use std::sync::{Arc, RwLock}; @@ -23,16 +22,16 @@ use std::sync::{Arc, RwLock}; /// search engine. #[derive(Clone)] pub struct TokenizerManager { - tokenizers: Arc>>, + tokenizers: Arc>>, } impl TokenizerManager { /// Registers a new tokenizer associated with a given name. - pub fn register(&self, tokenizer_name: &str, tokenizer: A) + pub fn register(&self, tokenizer_name: &str, tokenizer: T) where - A: Into, + TextAnalyzer: From, { - let boxed_tokenizer = tokenizer.into(); + let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer); self.tokenizers .write() .expect("Acquiring the lock should never fail") @@ -40,7 +39,7 @@ impl TokenizerManager { } /// Accessing a tokenizer given its name. - pub fn get(&self, tokenizer_name: &str) -> Option { + pub fn get(&self, tokenizer_name: &str) -> Option { self.tokenizers .read() .expect("Acquiring the lock should never fail") @@ -62,13 +61,13 @@ impl Default for TokenizerManager { manager.register("raw", RawTokenizer); manager.register( "default", - SimpleTokenizer + TextAnalyzer::from(SimpleTokenizer) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser), ); manager.register( "en_stem", - SimpleTokenizer + TextAnalyzer::from(SimpleTokenizer) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(Stemmer::new(Language::English)),