mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 01:02:55 +00:00)
Add boxed token filter to ease the building of TextAnalyzer with a vec of filters.
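Before the diff itself, a quick orientation: the commit adds a second, dynamic way to assemble a `TextAnalyzer`, alongside the existing static builder. A minimal sketch of both construction paths, assuming these types are re-exported from `tantivy::tokenizer` as usual:

use tantivy::tokenizer::{
    AlphaNumOnlyFilter, BoxTokenFilter, LowerCaser, TextAnalyzer, WhitespaceTokenizer,
};

fn main() {
    // Static chain: the filter types are fixed at compile time and the
    // builder nests them into one concrete tokenizer type.
    let _static_analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
        .filter(AlphaNumOnlyFilter)
        .filter(LowerCaser)
        .build();

    // Dynamic chain: the new `TextAnalyzer::build` takes a Vec of
    // type-erased filters that can be assembled at runtime, e.g. from a
    // user-supplied configuration.
    let filters: Vec<BoxTokenFilter> = vec![
        BoxTokenFilter::from(AlphaNumOnlyFilter),
        BoxTokenFilter::from(LowerCaser),
    ];
    let _dynamic_analyzer = TextAnalyzer::build(WhitespaceTokenizer::default(), filters);
}

The boxed path pays one allocation and one layer of virtual dispatch per filter; the builder keeps the whole chain monomorphized.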
(first file, presumably src/tokenizer/tokenizer.rs)

@@ -1,6 +1,8 @@
+use std::ops::Deref;
+
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
+use tokenizer_api::{BoxTokenStream, TokenFilter, TokenStream, Tokenizer};

 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
@@ -10,7 +12,7 @@ pub struct TextAnalyzer {
 }

 /// A boxable `Tokenizer`, with its `TokenStream` type erased.
-trait BoxableTokenizer: 'static + Send + Sync {
+pub trait BoxableTokenizer: 'static + Send + Sync {
     /// Creates a boxed token stream for a given `str`.
     fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
     /// Clone this tokenizer.
@@ -26,6 +28,83 @@ impl<T: Tokenizer> BoxableTokenizer for T {
     }
 }

+pub struct BoxedTokenizer(Box<dyn BoxableTokenizer>);
+
+impl Clone for BoxedTokenizer {
+    fn clone(&self) -> BoxedTokenizer {
+        Self(self.0.box_clone())
+    }
+}
+
+impl Tokenizer for BoxedTokenizer {
+    type TokenStream<'a> = Box<dyn TokenStream + 'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        self.0.box_token_stream(text).into()
+    }
+}
+
+/// Trait for the pluggable components of `Tokenizer`s.
+pub trait BoxableTokenFilter: 'static + Send + Sync {
+    /// Wraps a Tokenizer and returns a new one.
+    fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer>;
+}
+
+impl<T: TokenFilter> BoxableTokenFilter for T {
+    fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer> {
+        let tokenizer = self.clone().transform(tokenizer);
+        tokenizer.box_clone()
+    }
+}
+
+pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
+
+impl Deref for BoxTokenFilter {
+    type Target = dyn BoxableTokenFilter;
+
+    fn deref(&self) -> &dyn BoxableTokenFilter {
+        &*self.0
+    }
+}
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
+
+impl TextAnalyzer {
+    /// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
+    ///
+    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
+    /// `TextAnalyzer::from(tokenizer)`.
+    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
+    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()`.
+    pub fn build<T: Tokenizer>(
+        tokenizer: T,
+        boxed_token_filters: Vec<BoxTokenFilter>,
+    ) -> TextAnalyzer {
+        let mut boxed_tokenizer = BoxedTokenizer(Box::new(tokenizer));
+        for filter in boxed_token_filters.into_iter() {
+            let filtered_boxed_tokenizer = filter.box_transform(boxed_tokenizer);
+            boxed_tokenizer = BoxedTokenizer(filtered_boxed_tokenizer);
+        }
+        TextAnalyzer {
+            tokenizer: boxed_tokenizer.0,
+        }
+    }
+
+    /// Create a new TextAnalyzerBuilder
+    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
+        TextAnalyzerBuilder { tokenizer }
+    }
+
+    /// Creates a token stream for a given `str`.
+    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
+        self.tokenizer.box_token_stream(text)
+    }
+}
+
 impl Clone for TextAnalyzer {
     fn clone(&self) -> Self {
         TextAnalyzer {
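This hunk is the heart of the change: `TokenFilter::transform` is generic over the wrapped tokenizer and consumes `self`, so `TokenFilter` cannot be made into a trait object directly; the dyn-safe companion trait `BoxableTokenFilter`, with a blanket impl covering every `TokenFilter`, restores boxability. A standalone sketch of the same pattern using simplified stand-in traits (not tantivy's real API):

// A rich trait: `transform` consumes `self`, and the `Clone` supertrait
// alone already makes the trait non-object-safe.
trait Filter: 'static + Clone {
    fn transform(self, input: String) -> String;
}

// Dyn-safe companion, playing the role of `BoxableTokenFilter`.
trait BoxableFilter: 'static {
    fn box_transform(&self, input: String) -> String;
}

// Blanket impl. Only `&self` is available through a box while `transform`
// takes ownership, hence the `Clone` bound on `Filter` (mirroring the
// `Clone` bound this commit adds to `TokenFilter` in tokenizer-api below).
impl<F: Filter> BoxableFilter for F {
    fn box_transform(&self, input: String) -> String {
        self.clone().transform(input)
    }
}

#[derive(Clone)]
struct Trim;
impl Filter for Trim {
    fn transform(self, input: String) -> String {
        input.trim().to_string()
    }
}

#[derive(Clone)]
struct Upper;
impl Filter for Upper {
    fn transform(self, input: String) -> String {
        input.to_uppercase()
    }
}

fn main() {
    // Heterogeneous filters in one Vec, applied in order, just as
    // `TextAnalyzer::build` folds its `Vec<BoxTokenFilter>` over the tokenizer.
    let filters: Vec<Box<dyn BoxableFilter>> = vec![Box::new(Trim), Box::new(Upper)];
    let mut text = "  hello  ".to_string();
    for filter in &filters {
        text = filter.box_transform(text);
    }
    assert_eq!(text, "HELLO");
}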
@@ -46,20 +125,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
     }
 }

-impl TextAnalyzer {
-    /// Create a new TextAnalyzerBuilder
-    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
-        TextAnalyzerBuilder { tokenizer }
-    }
-
-    /// Creates a token stream for a given `str`.
-    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
-    }
-}
-
 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder<T> {
+pub struct TextAnalyzerBuilder<T: Tokenizer> {
     tokenizer: T,
 }
@@ -90,3 +157,37 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
+
+    #[test]
+    fn test_text_analyzer_builder() {
+        let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
+            .filter(AlphaNumOnlyFilter)
+            .filter(RemoveLongFilter::limit(6))
+            .filter(LowerCaser)
+            .build();
+        let mut stream = analyzer.token_stream("- first bullet point");
+        assert_eq!(stream.next().unwrap().text, "first");
+        assert_eq!(stream.next().unwrap().text, "point");
+    }
+
+    #[test]
+    fn test_text_analyzer_with_filters_boxed() {
+        let mut analyzer = TextAnalyzer::build(
+            WhitespaceTokenizer::default(),
+            vec![
+                BoxTokenFilter::from(AlphaNumOnlyFilter),
+                BoxTokenFilter::from(LowerCaser),
+                BoxTokenFilter::from(RemoveLongFilter::limit(6)),
+            ],
+        );
+        let mut stream = analyzer.token_stream("- first bullet point");
+        assert_eq!(stream.next().unwrap().text, "first");
+        assert_eq!(stream.next().unwrap().text, "point");
+    }
+}
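For reference, a hedged sketch of consuming an analyzer like those in the tests above; `text` and `position` are public fields of tantivy's `Token`, and `next()` is the convenience accessor the assertions rely on:

use tantivy::tokenizer::{LowerCaser, TextAnalyzer, WhitespaceTokenizer};

fn main() {
    let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
        .filter(LowerCaser)
        .build();

    // `token_stream` borrows both the analyzer and the text for the
    // lifetime of the stream, matching the `&'a mut self, text: &'a str`
    // signature in the diff.
    let mut stream = analyzer.token_stream("Hello Happy Tax Payer");
    while let Some(token) = stream.next() {
        println!("{:?} at position {}", token.text, token.position);
    }
}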
(second file, presumably tokenizer-api/src/lib.rs)

@@ -63,6 +63,12 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
 pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);

+impl<'a> From<BoxTokenStream<'a>> for Box<dyn TokenStream + 'a> {
+    fn from(token_stream: BoxTokenStream<'a>) -> Self {
+        token_stream.0
+    }
+}
+
 impl<'a, T> From<T> for BoxTokenStream<'a>
 where T: TokenStream + 'a
 {
@@ -78,6 +84,7 @@ impl<'a> Deref for BoxTokenStream<'a> {
         &*self.0
     }
 }
+
 impl<'a> DerefMut for BoxTokenStream<'a> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         &mut *self.0
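These two hunks fill out `BoxTokenStream`'s newtype plumbing: the new `From` impl hands back the inner `Box<dyn TokenStream>`, which is what lets `BoxedTokenizer::token_stream` in the first file end in `.into()`, while the existing `Deref`/`DerefMut` let callers drive the stream through the wrapper. A standalone sketch of this newtype pattern, with a stand-in trait rather than tantivy's:

use std::ops::{Deref, DerefMut};

// Stand-in for a token-stream-like trait (not tantivy's API).
trait Stream {
    fn advance(&mut self) -> Option<u32>;
}

// Newtype around a boxed trait object, mirroring `BoxTokenStream`.
struct BoxStream<'a>(Box<dyn Stream + 'a>);

// Like the `From` impl added above: unwrap and hand back the inner box.
impl<'a> From<BoxStream<'a>> for Box<dyn Stream + 'a> {
    fn from(stream: BoxStream<'a>) -> Self {
        stream.0
    }
}

impl<'a> Deref for BoxStream<'a> {
    type Target = dyn Stream + 'a;

    fn deref(&self) -> &Self::Target {
        &*self.0
    }
}

// `DerefMut` is what allows `&mut self` methods through the wrapper.
impl<'a> DerefMut for BoxStream<'a> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut *self.0
    }
}

struct Counter(u32);

impl Stream for Counter {
    fn advance(&mut self) -> Option<u32> {
        self.0 += 1;
        Some(self.0)
    }
}

fn main() {
    let mut stream = BoxStream(Box::new(Counter(0)));
    assert_eq!(stream.advance(), Some(1)); // `&mut` call via `DerefMut`
    let inner: Box<dyn Stream> = stream.into(); // unwrap via `From`
    drop(inner);
}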
@@ -137,11 +144,11 @@ pub trait TokenStream {
 }

 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync {
+pub trait TokenFilter: 'static + Send + Sync + Clone {
     /// The Tokenizer type returned by this filter, typically parametrized by the underlying
     /// Tokenizer.
     type Tokenizer<T: Tokenizer>: Tokenizer;
     /// Wraps a Tokenizer and returns a new one.
     fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
 }