Refactor token filter.

This commit is contained in:
François Massot
2023-06-25 12:36:47 +02:00
parent dc783f8328
commit 2cab111f99
10 changed files with 134 additions and 195 deletions

View File

@@ -1,5 +1,7 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
use tantivy::tokenizer::{
BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};
const ALICE_TXT: &str = include_str!("alice.txt");
@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
assert_eq!(word_count, 30_731);
})
});
let token_filters = vec![
BoxTokenFilter::from(RemoveLongFilter::limit(40)),
BoxTokenFilter::from(LowerCaser),
];
let mut dynamic_analyzer = TextAnalyzer::new(SimpleTokenizer::default(), token_filters);
c.bench_function("default-dynamic-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group!(benches, criterion_benchmark);
criterion_group! {
name = benches;
config = Criterion::default().sample_size(200);
targets = criterion_benchmark
}
criterion_main!(benches);

View File

@@ -21,7 +21,7 @@
//! // the "emoji" is dropped because it's not alphanumeric
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.
@@ -39,23 +39,10 @@ impl<T> AlphaNumOnlyFilterStream<T> {
}
impl TokenFilter for AlphaNumOnlyFilter {
type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
type OutputTokenStream<T: TokenStream> = AlphaNumOnlyFilterStream<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper<T> {
AlphaNumOnlyFilterWrapper(tokenizer)
}
}
#[derive(Clone)]
pub struct AlphaNumOnlyFilterWrapper<T>(T);
impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
AlphaNumOnlyFilterStream {
tail: self.0.token_stream(text),
}
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
AlphaNumOnlyFilterStream { tail: token_stream }
}
}

View File

@@ -1,6 +1,6 @@
use std::mem;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
@@ -9,48 +9,30 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
pub struct AsciiFoldingFilter;
impl TokenFilter for AsciiFoldingFilter {
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
type OutputTokenStream<T: TokenStream> = AsciiFoldingFilterTokenStream<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
AsciiFoldingFilterWrapper {
tokenizer,
buffer: String::new(),
}
}
}
#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T> {
tokenizer: T,
buffer: String,
}
impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.buffer.clear();
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
AsciiFoldingFilterTokenStream {
buffer: &mut self.buffer,
tail: self.tokenizer.token_stream(text),
buffer: String::new(),
tail: token_stream,
}
}
}
pub struct AsciiFoldingFilterTokenStream<'a, T> {
buffer: &'a mut String,
pub struct AsciiFoldingFilterTokenStream<T> {
buffer: String,
tail: T,
}
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
if !self.token_mut().text.is_ascii() {
// Only fold tokens that contain non-ASCII characters; ASCII-only text is left untouched.
to_ascii(&self.tail.token().text, self.buffer);
mem::swap(&mut self.tail.token_mut().text, self.buffer);
to_ascii(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
}

View File

@@ -1,42 +1,24 @@
use std::mem;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};
/// Token filter that lowercases terms.
#[derive(Clone)]
pub struct LowerCaser;
impl TokenFilter for LowerCaser {
type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
type OutputTokenStream<T: TokenStream> = LowerCaserTokenStream<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
LowerCaserFilter {
tokenizer,
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
LowerCaserTokenStream {
tail: token_stream,
buffer: String::new(),
}
}
}
#[derive(Clone)]
pub struct LowerCaserFilter<T> {
tokenizer: T,
pub struct LowerCaserTokenStream<T> {
buffer: String,
}
impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.buffer.clear();
LowerCaserTokenStream {
tail: self.tokenizer.token_stream(text),
buffer: &mut self.buffer,
}
}
}
pub struct LowerCaserTokenStream<'a, T> {
buffer: &'a mut String,
tail: T,
}
@@ -51,7 +33,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
}
}
impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
@@ -60,8 +42,8 @@ impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&self.tail.token().text, self.buffer);
mem::swap(&mut self.tail.token_mut().text, self.buffer);
to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
}

View File

@@ -12,7 +12,7 @@
//! assert_eq!(stream.next().unwrap().text, "nice");
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};
/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
@@ -38,29 +38,12 @@ impl<T> RemoveLongFilterStream<T> {
}
impl TokenFilter for RemoveLongFilter {
type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
type OutputTokenStream<T: TokenStream> = RemoveLongFilterStream<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> RemoveLongFilterWrapper<T> {
RemoveLongFilterWrapper {
length_limit: self.length_limit,
inner: tokenizer,
}
}
}
#[derive(Clone)]
pub struct RemoveLongFilterWrapper<T: Tokenizer> {
length_limit: usize,
inner: T,
}
impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
RemoveLongFilterStream {
token_length_limit: self.length_limit,
tail: self.inner.token_stream(text),
tail: token_stream,
}
}
}

View File

@@ -1,6 +1,6 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};
/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
@@ -80,29 +80,12 @@ impl SplitCompoundWords {
}
impl TokenFilter for SplitCompoundWords {
type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
type OutputTokenStream<T: TokenStream> = SplitCompoundWordsTokenStream<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
}
}
}
#[derive(Clone)]
pub struct SplitCompoundWordsFilter<T> {
dict: AhoCorasick,
inner: T,
}
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
tail: token_stream,
cuts: Vec::new(),
parts: Vec::new(),
}

View File

@@ -4,7 +4,7 @@ use std::mem;
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};
/// Available stemmer languages.
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -81,29 +81,12 @@ impl Default for Stemmer {
}
impl TokenFilter for Stemmer {
type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
type OutputTokenStream<T: TokenStream> = StemmerTokenStream<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
StemmerFilter {
stemmer_algorithm: self.stemmer_algorithm,
inner: tokenizer,
}
}
}
#[derive(Clone)]
pub struct StemmerFilter<T> {
stemmer_algorithm: Algorithm,
inner: T,
}
impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
StemmerTokenStream {
tail: self.inner.token_stream(text),
tail: token_stream,
stemmer,
buffer: String::new(),
}

View File

@@ -21,7 +21,7 @@ use rustc_hash::FxHashSet;
#[cfg(feature = "stopwords")]
use super::Language;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)]
@@ -72,29 +72,12 @@ impl StopWordFilter {
}
impl TokenFilter for StopWordFilter {
type Tokenizer<T: Tokenizer> = StopWordFilterWrapper<T>;
type OutputTokenStream<T: TokenStream> = StopWordFilterStream<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> StopWordFilterWrapper<T> {
StopWordFilterWrapper {
words: self.words,
inner: tokenizer,
}
}
}
#[derive(Clone)]
pub struct StopWordFilterWrapper<T> {
words: Arc<FxHashSet<String>>,
inner: T,
}
impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
StopWordFilterStream {
words: self.words.clone(),
tail: self.inner.token_stream(text),
tail: token_stream,
}
}
}

View File

@@ -1,14 +1,14 @@
use dyn_clone::DynClone;
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{TokenFilter, TokenStream, Tokenizer};
use tokenizer_api::{FilteredTokenizer, TokenFilter, TokenStream, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
#[derive(Clone)]
pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>,
token_filters: Vec<BoxTokenFilter>,
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
@@ -25,32 +25,30 @@ impl<T: Tokenizer> BoxableTokenizer for T {
dyn_clone::clone_trait_object!(BoxableTokenizer);
/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
#[derive(Clone)]
struct BoxTokenizer(Box<dyn BoxableTokenizer>);
impl Tokenizer for BoxTokenizer {
type TokenStream<'a> = Box<dyn TokenStream + 'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.0.box_token_stream(text).into()
}
}
/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
trait BoxableTokenFilter: 'static + Send + Sync {
/// Wraps a `BoxedTokenizer` and returns a new one.
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
trait BoxableTokenFilter: 'static + Send + Sync + DynClone {
/// Transforms a boxed token stream into a new one.
fn box_transform<'a>(
&self,
token_stream: Box<dyn TokenStream + 'a>,
) -> Box<dyn TokenStream + 'a>;
}
impl<T: TokenFilter> BoxableTokenFilter for T {
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
let tokenizer = self.clone().transform(tokenizer);
BoxTokenizer(Box::new(tokenizer))
fn box_transform<'a>(
&self,
token_stream: Box<dyn TokenStream + 'a>,
) -> Box<dyn TokenStream + 'a> {
Box::new(self.clone().filter(token_stream))
}
}
/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
dyn_clone::clone_trait_object!(BoxableTokenFilter);
/// Simple wrapper of `Box<dyn BoxableTokenFilter>`.
///
/// See [`TokenFilter`] for more information.
#[derive(Clone)]
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
impl<T: TokenFilter> From<T> for BoxTokenFilter {
@@ -59,6 +57,19 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
}
}
/// Manual `Clone` implementation for [`TextAnalyzer`].
///
/// Clones the boxed tokenizer and the list of boxed token filters.
impl Clone for TextAnalyzer {
    fn clone(&self) -> Self {
        TextAnalyzer {
            tokenizer: self.tokenizer.clone(),
            // `Vec<BoxTokenFilter>` already clones element by element, so no
            // hand-written `iter().map(clone).collect()` chain is needed.
            token_filters: self.token_filters.clone(),
        }
    }
}
impl TextAnalyzer {
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
@@ -71,7 +82,7 @@ impl TextAnalyzer {
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::build(
/// let en_stem = TextAnalyzer::new(
/// SimpleTokenizer::default(),
/// vec![
/// BoxTokenFilter::from(RemoveLongFilter::limit(40)),
@@ -79,27 +90,25 @@ impl TextAnalyzer {
/// BoxTokenFilter::from(Stemmer::default()),
/// ]);
/// ```
pub fn build<T: Tokenizer>(
tokenizer: T,
boxed_token_filters: Vec<BoxTokenFilter>,
) -> TextAnalyzer {
let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
for filter in boxed_token_filters.into_iter() {
boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
}
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
TextAnalyzer {
tokenizer: boxed_tokenizer.0,
tokenizer: Box::new(tokenizer),
token_filters,
}
}
/// Create a new TextAnalyzerBuilder
/// Create a new TextAnalyzerBuilder.
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
// The builder starts out holding only the given tokenizer.
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
self.tokenizer.box_token_stream(text)
let mut token_stream = self.tokenizer.box_token_stream(text);
for token_filter in &self.token_filters {
token_stream = token_filter.0.box_transform(token_stream);
}
token_stream
}
}
@@ -134,7 +143,10 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
/// .filter(Stemmer::default())
/// .build();
/// ```
pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
pub fn filter<F: TokenFilter>(
self,
token_filter: F,
) -> TextAnalyzerBuilder<FilteredTokenizer<T, F>> {
TextAnalyzerBuilder {
tokenizer: token_filter.transform(self.tokenizer),
}
@@ -144,6 +156,7 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
pub fn build(self) -> TextAnalyzer {
    // Filters added through the builder are already wrapped around the
    // tokenizer itself, so the dynamic filter list starts out empty.
    let tokenizer = self.tokenizer;
    TextAnalyzer {
        token_filters: Vec::new(),
        tokenizer: Box::new(tokenizer),
    }
}
}
@@ -168,7 +181,7 @@ mod tests {
#[test]
fn test_text_analyzer_with_filters_boxed() {
let mut analyzer = TextAnalyzer::build(
let mut analyzer = TextAnalyzer::new(
WhitespaceTokenizer::default(),
vec![
BoxTokenFilter::from(AlphaNumOnlyFilter),

View File

@@ -115,9 +115,31 @@ pub trait TokenStream {
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// The token stream type returned by this filter, typically parametrized by the underlying
/// token stream.
type Tokenizer<T: Tokenizer>: Tokenizer;
type OutputTokenStream<T: TokenStream>: TokenStream;
/// Filters a token stream and returns a new one.
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T>;
/// Wraps a Tokenizer and returns a new one.
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> FilteredTokenizer<T, Self> {
FilteredTokenizer {
tokenizer,
token_filter: self,
}
}
}
/// A tokenizer paired with a token filter.
///
/// Built by `TokenFilter::transform`; its token stream is the wrapped tokenizer's
/// stream passed through the wrapped filter.
#[derive(Clone)]
pub struct FilteredTokenizer<T: Tokenizer, F: TokenFilter> {
// Tokenizer that produces the initial token stream.
tokenizer: T,
// Filter applied to every token stream produced by `tokenizer`.
token_filter: F,
}
impl<T: Tokenizer, F: TokenFilter> Tokenizer for FilteredTokenizer<T, F> {
    /// The filter's output stream, parametrized by the wrapped tokenizer's stream.
    type TokenStream<'a> = F::OutputTokenStream<T::TokenStream<'a>>;

    /// Tokenizes `text` with the wrapped tokenizer, then passes the resulting
    /// stream through the wrapped token filter.
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        self.token_filter.filter(self.tokenizer.token_stream(text))
    }
}
#[cfg(test)]