Add a boxed token filter to ease building a TextAnalyzer with a vec of filters.

François Massot
2023-06-20 00:10:30 +02:00
parent 44850e1036
commit ad9b825067
2 changed files with 125 additions and 17 deletions
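In short, this change lets callers assemble a TextAnalyzer from a Vec of type-erased token filters built at runtime, complementing the static builder chain. A minimal usage sketch, mirroring the tests added below and assuming the new BoxTokenFilter type is re-exported from tantivy::tokenizer:

use tantivy::tokenizer::{
    AlphaNumOnlyFilter, BoxTokenFilter, LowerCaser, TextAnalyzer, WhitespaceTokenizer,
};

fn main() {
    // Filters are collected at runtime; each one is type-erased in a BoxTokenFilter.
    let filters = vec![
        BoxTokenFilter::from(AlphaNumOnlyFilter),
        BoxTokenFilter::from(LowerCaser),
    ];
    let mut analyzer = TextAnalyzer::build(WhitespaceTokenizer::default(), filters);

    // AlphaNumOnlyFilter drops "-", LowerCaser lowercases the remaining tokens.
    let mut stream = analyzer.token_stream("- first Bullet Point");
    while stream.advance() {
        println!("{}", stream.token().text); // first, bullet, point
    }
}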


@@ -1,6 +1,8 @@
use std::ops::Deref;
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, TokenFilter, TokenStream, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
@@ -10,7 +12,7 @@ pub struct TextAnalyzer {
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
pub trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
@@ -26,6 +28,83 @@ impl<T: Tokenizer> BoxableTokenizer for T {
}
}
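/// A boxed `BoxableTokenizer`.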
pub struct BoxedTokenizer(Box<dyn BoxableTokenizer>);
impl Clone for BoxedTokenizer {
fn clone(&self) -> BoxedTokenizer {
Self(self.0.box_clone())
}
}
impl Tokenizer for BoxedTokenizer {
type TokenStream<'a> = Box<dyn TokenStream + 'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.0.box_token_stream(text).into()
}
}
/// A boxable `TokenFilter`, with the `Tokenizer` type it produces erased.
pub trait BoxableTokenFilter: 'static + Send + Sync {
/// Wraps a Tokenizer and returns a new one.
fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer>;
}
impl<T: TokenFilter> BoxableTokenFilter for T {
fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer> {
let tokenizer = self.clone().transform(tokenizer);
tokenizer.box_clone()
}
}
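/// A boxed `TokenFilter`.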
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn BoxableTokenFilter;
fn deref(&self) -> &dyn BoxableTokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(token_filter: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(token_filter))
}
}
impl TextAnalyzer {
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
/// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()`.
pub fn build<T: Tokenizer>(
tokenizer: T,
boxed_token_filters: Vec<BoxTokenFilter>,
) -> TextAnalyzer {
let mut boxed_tokenizer = BoxedTokenizer(Box::new(tokenizer));
for filter in boxed_token_filters.into_iter() {
let filtered_boxed_tokenizer = filter.box_transform(boxed_tokenizer);
boxed_tokenizer = BoxedTokenizer(filtered_boxed_tokenizer);
}
TextAnalyzer {
tokenizer: boxed_tokenizer.0,
}
}
/// Creates a new `TextAnalyzerBuilder`.
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.tokenizer.box_token_stream(text)
}
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
@@ -46,20 +125,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
}
}
/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T: Tokenizer> {
tokenizer: T,
}
@@ -90,3 +157,37 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
#[test]
fn test_text_analyzer_builder() {
let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
.filter(AlphaNumOnlyFilter)
.filter(RemoveLongFilter::limit(6))
.filter(LowerCaser)
.build();
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "point");
}
#[test]
fn test_text_analyzer_with_filters_boxed() {
let mut analyzer = TextAnalyzer::build(
WhitespaceTokenizer::default(),
vec![
BoxTokenFilter::from(AlphaNumOnlyFilter),
BoxTokenFilter::from(LowerCaser),
BoxTokenFilter::from(RemoveLongFilter::limit(6)),
],
);
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "point");
}
}


@@ -63,6 +63,12 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a> From<BoxTokenStream<'a>> for Box<dyn TokenStream + 'a> {
fn from(token_stream: BoxTokenStream<'a>) -> Self {
token_stream.0
}
}
impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
@@ -78,6 +84,7 @@ impl<'a> Deref for BoxTokenStream<'a> {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
@@ -137,11 +144,11 @@ pub trait TokenStream {
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// The Tokenizer type returned by this filter, typically parametrized by the underlying
/// Tokenizer.
type Tokenizer<T: Tokenizer>: Tokenizer;
/// Wraps a Tokenizer and returns a new one.
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}
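Note that the blanket BoxableTokenFilter impl above relies on the Clone bound this hunk adds to TokenFilter: box_transform must clone the filter because transform consumes self. For illustration, a minimal filter under the new bound might look like the sketch below; NoopFilter is hypothetical and not part of this commit.

use tokenizer_api::{TokenFilter, Tokenizer};

// A pass-through filter. `Clone` is now required by the trait; a derive suffices here.
#[derive(Clone)]
struct NoopFilter;

impl TokenFilter for NoopFilter {
    // The tokenizer type this filter produces: a no-op simply returns its input type.
    type Tokenizer<T: Tokenizer> = T;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
        tokenizer
    }
}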