diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 0ce418ae8..532a21293 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -9,11 +9,15 @@ pub struct TextAnalyzer { tokenizer: Box, } -impl Tokenizer for Box { +/// Wrapper to avoid recursive calls of `box_token_stream`. +#[derive(Clone)] +struct BoxedTokenizer(Box); + +impl Tokenizer for BoxedTokenizer { type TokenStream<'a> = BoxTokenStream<'a>; fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - self.box_token_stream(text) + self.0.box_token_stream(text) } } @@ -23,9 +27,9 @@ impl Clone for Box { } } -fn add_filter(tokenizer: Box, filter: F) -> Box { +fn add_filter(tokenizer: BoxedTokenizer, filter: F) -> BoxedTokenizer { let filtered_tokenizer = filter.transform(tokenizer); - Box::new(filtered_tokenizer) + BoxedTokenizer(Box::new(filtered_tokenizer)) } @@ -72,6 +76,11 @@ impl TextAnalyzer { TextAnalyzerBuilder { tokenizer } } + /// Creates a [`DynamicTextAnalyzerBuilder`] wrapping the given tokenizer. + pub fn dynamic_filter_builder(tokenizer: T) -> DynamicTextAnalyzerBuilder { + DynamicTextAnalyzerBuilder::new(tokenizer) + } + /// Creates a token stream for a given `str`. pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> { self.tokenizer.box_token_stream(text) @@ -110,3 +119,57 @@ impl TextAnalyzerBuilder { } } } + +/// Builder helper for [`TextAnalyzer`] with dynamic filters. 
+pub struct DynamicTextAnalyzerBuilder { + tokenizer: BoxedTokenizer, +} + +impl DynamicTextAnalyzerBuilder { + pub fn new(tokenizer: T) -> DynamicTextAnalyzerBuilder { + DynamicTextAnalyzerBuilder { tokenizer: BoxedTokenizer(Box::new(tokenizer)) } + } + + pub fn filter(self, filter: F) -> DynamicTextAnalyzerBuilder { + DynamicTextAnalyzerBuilder { + tokenizer: add_filter(self.tokenizer, filter), + } + } + + pub fn build(self) -> TextAnalyzer { + TextAnalyzer { + tokenizer: self.tokenizer.0, + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer}; + + #[test] + fn test_text_analyzer_builder() { + let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default()) + .filter(AlphaNumOnlyFilter) + .filter(RemoveLongFilter::limit(6)) + .filter(LowerCaser) + .build(); + let mut stream = analyzer.token_stream("- first bullet point"); + assert_eq!(stream.next().unwrap().text, "first"); + assert_eq!(stream.next().unwrap().text, "point"); + } + + #[test] + fn test_text_analyzer_with_filters_boxed() { + let mut analyzer = TextAnalyzer::dynamic_filter_builder(WhitespaceTokenizer::default()) + .filter(AlphaNumOnlyFilter) + .filter(RemoveLongFilter::limit(6)) + .filter(LowerCaser) + .build(); + let mut stream = analyzer.token_stream("- first bullet point"); + assert_eq!(stream.next().unwrap().text, "first"); + assert_eq!(stream.next().unwrap().text, "point"); + } +}