Commit 81330aaf89 (parent 98a3b01992)
Author: François Massot, 2023-06-30 12:46:31 +02:00
Committed by: Paul Masurel
2 changed files with 61 additions and 32 deletions

View File (Criterion tokenizer benchmark)

@@ -1,5 +1,5 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::TokenizerManager;
+use tantivy::tokenizer::{TokenizerManager, TextAnalyzer, RemoveLongFilter, LowerCaser, SimpleTokenizer};
 const ALICE_TXT: &str = include_str!("alice.txt");
@@ -16,7 +16,40 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
+    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .build();
+    c.bench_function("static-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = static_analyzer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
+    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+        .dynamic()
+        .filter_dynamic(RemoveLongFilter::limit(40))
+        .filter_dynamic(LowerCaser)
+        .build();
+    c.bench_function("dynamic-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
 }
-criterion_group!(benches, criterion_benchmark);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().sample_size(200);
+    targets = criterion_benchmark
+}
 criterion_main!(benches);
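Both new benches run the same filter chain (`RemoveLongFilter::limit(40)` followed by `LowerCaser`) over alice.txt and assert the same count of 30,731 tokens, so the gap between their reported times isolates the cost of the boxing introduced by `dynamic()`/`filter_dynamic`. The expanded `criterion_group!` form also raises the sample count to 200 (Criterion's default is 100) to tighten the measurement.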

View File (TextAnalyzer implementation in the tokenizer module)

@@ -5,6 +5,7 @@ use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
+#[derive(Clone)]
 pub struct TextAnalyzer {
     tokenizer: Box<dyn BoxableTokenizer>,
 }
@@ -19,7 +20,7 @@ impl Tokenizer for Box<dyn BoxableTokenizer> {
 impl Clone for Box<dyn BoxableTokenizer> {
     fn clone(&self) -> Self {
-        self.box_clone()
+        (**self).box_clone()
     }
 }
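The deref in `(**self).box_clone()` is the important part: because `Box<dyn BoxableTokenizer>` itself implements `Tokenizer` (the impl this hunk sits in), it also satisfies the blanket `impl<T: Tokenizer> BoxableTokenizer for T`, so a bare `self.box_clone()` would resolve to that blanket impl on the box and clone through this very `Clone` impl again, recursing instead of reaching the wrapped tokenizer. A minimal, self-contained sketch of the pattern, with placeholder names rather than tantivy's:

// Illustrative sketch of the box-clone pattern (placeholder names, not tantivy's types).
trait Boxable {
    fn box_clone(&self) -> Box<dyn Boxable>;
}

// Blanket impl: any clonable, 'static type can be boxed and cloned.
impl<T: Clone + 'static> Boxable for T {
    fn box_clone(&self) -> Box<dyn Boxable> {
        Box::new(self.clone())
    }
}

impl Clone for Box<dyn Boxable> {
    fn clone(&self) -> Self {
        // `(**self)` reaches the inner `dyn Boxable`. A bare `self.box_clone()`
        // would pick the blanket impl on `Box<dyn Boxable>` itself (the box is
        // `Clone` thanks to this very impl) and recurse instead of cloning the
        // underlying value.
        (**self).box_clone()
    }
}

fn main() {
    let boxed: Box<dyn Boxable> = Box::new(String::from("token"));
    let _copy = boxed.clone(); // clones the inner String via box_clone
}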
@@ -40,14 +41,6 @@ impl<T: Tokenizer> BoxableTokenizer for T {
     }
 }
-impl Clone for TextAnalyzer {
-    fn clone(&self) -> Self {
-        TextAnalyzer {
-            tokenizer: self.tokenizer.box_clone(),
-        }
-    }
-}
 impl Default for TextAnalyzer {
     fn default() -> TextAnalyzer {
         TextAnalyzer::from(EmptyTokenizer)
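Since `Box<dyn BoxableTokenizer>` is clonable through the impl above, the derived `Clone` on `TextAnalyzer` covers its single `tokenizer` field, and the hand-written impl removed here duplicated what `#[derive(Clone)]` now provides.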
@@ -97,7 +90,10 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
         }
     }
-    // Boxes the internal tokenizer. This is useful to write generic code.
+    /// Boxes the internal tokenizer. This is useful to write generic code.
+    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
+    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
+    /// will be more performant and create less boxes.
     pub fn dynamic(self) -> TextAnalyzerBuilder {
         let boxed_tokenizer = Box::new(self.tokenizer);
         TextAnalyzerBuilder {
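The new doc comment states the tradeoff; a short usage sketch of the two styles it contrasts, mirroring the benchmark above (the filter choices are illustrative):

use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

fn build_both_ways() {
    // Static composition: the filter chain is encoded in the builder's type,
    // so nothing is boxed per filter. Prefer this when the filters are known
    // at compile time.
    let _static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();

    // Dynamic composition: `dynamic()` boxes the tokenizer, and each
    // `filter_dynamic` call applies a filter and re-boxes the composed
    // tokenizer (it is `filter(...)` followed by `dynamic()`). Use this when
    // the chain is only known at runtime, e.g. parsed from configuration.
    let _dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .dynamic()
        .filter_dynamic(RemoveLongFilter::limit(40))
        .filter_dynamic(LowerCaser)
        .build();
}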
@@ -106,7 +102,6 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     }
     /// Apply a filter and returns a boxed version of the TextAnalyzerBuilder.
-    /// (If we prefer we can remove this method)
     pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
         self.filter(token_filter).dynamic()
     }
@@ -124,18 +119,18 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
 mod tests {
     use super::*;
-    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
+    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer, SimpleTokenizer};
     #[test]
     fn test_text_analyzer_builder() {
-        let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
-            .filter(AlphaNumOnlyFilter)
-            .filter(RemoveLongFilter::limit(6))
+        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
-            .build();
+            .build()
+            .clone();
         let mut stream = analyzer.token_stream("- first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
-        assert_eq!(stream.next().unwrap().text, "point");
+        assert_eq!(stream.next().unwrap().text, "bullet");
     }
@@ -156,19 +151,20 @@ mod tests {
             SerializableTokenFilterEnum::LowerCaser(LowerCaser),
             SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
         ];
-        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(WhitespaceTokenizer::default())
-            .dynamic();
-        for filter in filters {
-            analyzer_builder =
-                match filter {
-                    SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
-                        analyzer_builder.filter_dynamic(lower_caser),
-                    SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
-                        analyzer_builder.filter_dynamic(remove_long_filter)
-                    },
-                }
-        }
-        let mut analyzer = analyzer_builder.build();
+        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter_dynamic(RemoveLongFilter::limit(40))
+            .filter_dynamic(LowerCaser);
+        // for filter in filters {
+        //     analyzer_builder =
+        //         match filter {
+        //             SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
+        //                 analyzer_builder.filter_dynamic(lower_caser),
+        //             SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
+        //                 analyzer_builder.filter_dynamic(remove_long_filter)
+        //             },
+        //         }
+        // }
+        let mut analyzer = analyzer_builder.build().clone();
         let mut stream = analyzer.token_stream("first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
         assert_eq!(stream.next().unwrap().text, "bullet");