From ad9b825067c16c2301a87f830a38e5027ff31cf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Massot?=
Date: Tue, 20 Jun 2023 00:10:30 +0200
Subject: [PATCH] Add boxed token filter to ease the building of TextAnalyzer
 with a vec of filters.

---
 src/tokenizer/tokenizer.rs | 131 ++++++++++++++++++++++++++++++++-----
 tokenizer-api/src/lib.rs   |  11 +++-
 2 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index ccab6cda7..011978fb2 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -1,6 +1,8 @@
+use std::ops::Deref;
+
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
+use tokenizer_api::{BoxTokenStream, TokenFilter, TokenStream, Tokenizer};
 
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 
@@ -10,7 +12,7 @@ pub struct TextAnalyzer {
 }
 
 /// A boxable `Tokenizer`, with its `TokenStream` type erased.
-trait BoxableTokenizer: 'static + Send + Sync {
+pub trait BoxableTokenizer: 'static + Send + Sync {
     /// Creates a boxed token stream for a given `str`.
     fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
     /// Clone this tokenizer.
@@ -26,6 +28,83 @@ impl<T: Tokenizer> BoxableTokenizer for T {
     }
 }
 
+pub struct BoxedTokenizer(Box<dyn BoxableTokenizer>);
+
+impl Clone for BoxedTokenizer {
+    fn clone(&self) -> BoxedTokenizer {
+        Self(self.0.box_clone())
+    }
+}
+
+impl Tokenizer for BoxedTokenizer {
+    type TokenStream<'a> = Box<dyn TokenStream + 'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        self.0.box_token_stream(text).into()
+    }
+}
+
+/// Trait for the pluggable components of `Tokenizer`s.
+pub trait BoxableTokenFilter: 'static + Send + Sync {
+    /// Wraps a Tokenizer and returns a new one.
+    fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer>;
+}
+
+impl<T: TokenFilter> BoxableTokenFilter for T {
+    fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer> {
+        let tokenizer = self.clone().transform(tokenizer);
+        tokenizer.box_clone()
+    }
+}
+
+pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
+
+impl Deref for BoxTokenFilter {
+    type Target = dyn BoxableTokenFilter;
+
+    fn deref(&self) -> &dyn BoxableTokenFilter {
+        &*self.0
+    }
+}
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
+
+impl TextAnalyzer {
+    /// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
+    ///
+    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
+    /// `TextAnalyzer::from(tokenizer)`.
+    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
+    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()`.
+    pub fn build<T: Tokenizer>(
+        tokenizer: T,
+        boxed_token_filters: Vec<BoxTokenFilter>,
+    ) -> TextAnalyzer {
+        let mut boxed_tokenizer = BoxedTokenizer(Box::new(tokenizer));
+        for filter in boxed_token_filters.into_iter() {
+            let filtered_boxed_tokenizer = filter.box_transform(boxed_tokenizer);
+            boxed_tokenizer = BoxedTokenizer(filtered_boxed_tokenizer);
+        }
+        TextAnalyzer {
+            tokenizer: boxed_tokenizer.0,
+        }
+    }
+
+    /// Create a new TextAnalyzerBuilder
+    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
+        TextAnalyzerBuilder { tokenizer }
+    }
+
+    /// Creates a token stream for a given `str`.
+    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
+        self.tokenizer.box_token_stream(text)
+    }
+}
+
 impl Clone for TextAnalyzer {
     fn clone(&self) -> Self {
         TextAnalyzer {
@@ -46,20 +125,8 @@ impl<T: Tokenizer> From<T> for TextAnalyzer {
     }
 }
 
-impl TextAnalyzer {
-    /// Create a new TextAnalyzerBuilder
-    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
-        TextAnalyzerBuilder { tokenizer }
-    }
-
-    /// Creates a token stream for a given `str`.
-    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
-    }
-}
-
 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder {
+pub struct TextAnalyzerBuilder<T: Tokenizer> {
     tokenizer: T,
 }
 
@@ -90,3 +157,37 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     }
 }
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
+
+    #[test]
+    fn test_text_analyzer_builder() {
+        let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
+            .filter(AlphaNumOnlyFilter)
+            .filter(RemoveLongFilter::limit(6))
+            .filter(LowerCaser)
+            .build();
+        let mut stream = analyzer.token_stream("- first bullet point");
+        assert_eq!(stream.next().unwrap().text, "first");
+        assert_eq!(stream.next().unwrap().text, "point");
+    }
+
+    #[test]
+    fn test_text_analyzer_with_filters_boxed() {
+        let mut analyzer = TextAnalyzer::build(
+            WhitespaceTokenizer::default(),
+            vec![
+                BoxTokenFilter::from(AlphaNumOnlyFilter),
+                BoxTokenFilter::from(LowerCaser),
+                BoxTokenFilter::from(RemoveLongFilter::limit(6)),
+            ],
+        );
+        let mut stream = analyzer.token_stream("- first bullet point");
+        assert_eq!(stream.next().unwrap().text, "first");
+        assert_eq!(stream.next().unwrap().text, "point");
+    }
+}
diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs
index adb37a0b4..41312a61c 100644
--- a/tokenizer-api/src/lib.rs
+++ b/tokenizer-api/src/lib.rs
@@ -63,6 +63,12 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
 pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
 
+impl<'a> From<BoxTokenStream<'a>> for Box<dyn TokenStream + 'a> {
+    fn from(token_stream: BoxTokenStream<'a>) -> Self {
+        token_stream.0
+    }
+}
+
 impl<'a, T> From<T> for BoxTokenStream<'a>
 where T: TokenStream + 'a
 {
@@ -78,6 +84,7 @@ impl<'a> Deref for BoxTokenStream<'a> {
         &*self.0
     }
 }
+
 impl<'a> DerefMut for BoxTokenStream<'a> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         &mut *self.0
@@ -137,11 +144,11 @@ pub trait TokenStream {
 }
 
 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync {
+pub trait TokenFilter: 'static + Send + Sync + Clone {
     /// The Tokenizer type returned by this filter, typically parametrized by the underlying
    /// Tokenizer.
     type Tokenizer<T: Tokenizer>: Tokenizer;
     /// Wraps a Tokenizer and returns a new one.
     fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
 }
 
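
Usage note (reviewer sketch, not part of the patch): the point of `TextAnalyzer::build`
over the typed `builder(..).filter(..)` chain is that the filter list can be assembled at
runtime, e.g. from an index configuration. A minimal sketch of that use case follows,
assuming `BoxTokenFilter` is re-exported from `tantivy::tokenizer` alongside the existing
tokenizers; the function `analyzer_from_config` and its `lowercase` / `max_token_len`
settings are hypothetical and only illustrate the pattern.

use tantivy::tokenizer::{
    BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer,
};

// Builds an analyzer from runtime settings (hypothetical config fields for this sketch).
fn analyzer_from_config(lowercase: bool, max_token_len: Option<usize>) -> TextAnalyzer {
    // Collect the filters in a plain Vec. The statically typed builder cannot do
    // this, because each `.filter()` call changes the concrete tokenizer type.
    let mut filters: Vec<BoxTokenFilter> = Vec::new();
    if lowercase {
        filters.push(BoxTokenFilter::from(LowerCaser));
    }
    if let Some(limit) = max_token_len {
        filters.push(BoxTokenFilter::from(RemoveLongFilter::limit(limit)));
    }
    // `build` folds each boxed filter over the type-erased tokenizer in order.
    TextAnalyzer::build(SimpleTokenizer::default(), filters)
}

fn main() {
    let mut analyzer = analyzer_from_config(true, Some(40));
    let mut stream = analyzer.token_stream("Hello, Happy Tax Payer!");
    while let Some(token) = stream.next() {
        println!("{}", token.text);
    }
}

This is exactly what `BoxedTokenizer` and `BoxTokenFilter` buy: `box_transform` erases the
`TokenFilter::Tokenizer<T>` associated type at each step, so the loop in `build` can keep
reassigning the same `BoxedTokenizer` variable regardless of how many filters are applied.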