diff --git a/benches/analyzer.rs b/benches/analyzer.rs
index 7a96fa119..eb6574e86 100644
--- a/benches/analyzer.rs
+++ b/benches/analyzer.rs
@@ -1,5 +1,5 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::TokenizerManager;
+use tantivy::tokenizer::{TokenizerManager, TextAnalyzer, RemoveLongFilter, LowerCaser, SimpleTokenizer};
 
 const ALICE_TXT: &str = include_str!("alice.txt");
 
@@ -16,7 +16,40 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
+    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .build();
+    c.bench_function("static-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = static_analyzer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
+    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+        .dynamic()
+        .filter_dynamic(RemoveLongFilter::limit(40))
+        .filter_dynamic(LowerCaser)
+        .build();
+    c.bench_function("dynamic-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
 }
 
-criterion_group!(benches, criterion_benchmark);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().sample_size(200);
+    targets = criterion_benchmark
+}
 criterion_main!(benches);
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index 2bbb37256..da3760005 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -5,6 +5,7 @@ use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
+#[derive(Clone)]
 pub struct TextAnalyzer {
     tokenizer: Box<dyn BoxableTokenizer>,
 }
@@ -19,7 +20,7 @@ impl Tokenizer for Box<dyn BoxableTokenizer> {
 
 impl Clone for Box<dyn BoxableTokenizer> {
     fn clone(&self) -> Self {
-        self.box_clone()
+        (**self).box_clone()
     }
 }
 
@@ -40,14 +41,6 @@ impl<T: Tokenizer> BoxableTokenizer for T {
     }
 }
 
-impl Clone for TextAnalyzer {
-    fn clone(&self) -> Self {
-        TextAnalyzer {
-            tokenizer: self.tokenizer.box_clone(),
-        }
-    }
-}
-
 impl Default for TextAnalyzer {
     fn default() -> TextAnalyzer {
         TextAnalyzer::from(EmptyTokenizer)
@@ -97,7 +90,10 @@ impl TextAnalyzerBuilder {
         }
     }
 
-    // Boxes the internal tokenizer. This is useful to write generic code.
+    /// Boxes the internal tokenizer. This is useful to write generic code.
+    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
+    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
+    /// will be more performant and create less boxes.
     pub fn dynamic(self) -> TextAnalyzerBuilder {
         let boxed_tokenizer = Box::new(self.tokenizer);
         TextAnalyzerBuilder {
             tokenizer: boxed_tokenizer,
         }
    }
@@ -106,7 +102,6 @@ impl TextAnalyzerBuilder {
     }
 
     /// Apply a filter and returns a boxed version of the TextAnalyzerBuilder.
-    /// (If we prefer we can remove this method)
     pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
         self.filter(token_filter).dynamic()
     }
@@ -124,18 +119,18 @@ mod tests {
 
     use super::*;
-    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
+    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer, SimpleTokenizer};
 
     #[test]
     fn test_text_analyzer_builder() {
-        let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
-            .filter(AlphaNumOnlyFilter)
-            .filter(RemoveLongFilter::limit(6))
+        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
-            .build();
+            .build()
+            .clone();
         let mut stream = analyzer.token_stream("- first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
-        assert_eq!(stream.next().unwrap().text, "point");
+        assert_eq!(stream.next().unwrap().text, "bullet");
     }
 
@@ -156,19 +151,20 @@
             SerializableTokenFilterEnum::LowerCaser(LowerCaser),
             SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
         ];
-        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(WhitespaceTokenizer::default())
-            .dynamic();
-        for filter in filters {
-            analyzer_builder =
-                match filter {
-                    SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
-                        analyzer_builder.filter_dynamic(lower_caser),
-                    SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
-                        analyzer_builder.filter_dynamic(remove_long_filter)
-                    },
-                }
-        }
-        let mut analyzer = analyzer_builder.build();
+        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter_dynamic(RemoveLongFilter::limit(40))
+            .filter_dynamic(LowerCaser);
+        // for filter in filters {
+        //     analyzer_builder =
+        //         match filter {
+        //             SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
+        //                 analyzer_builder.filter_dynamic(lower_caser),
+        //             SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
+        //                 analyzer_builder.filter_dynamic(remove_long_filter)
+        //             },
+        //         }
+        // }
+        let mut analyzer = analyzer_builder.build().clone();
         let mut stream = analyzer.token_stream("first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
         assert_eq!(stream.next().unwrap().text, "bullet");
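For reference, here is a minimal usage sketch (not part of the patch) of the two builder paths this diff benchmarks: the static path, where each `.filter(...)` call composes the filter into the tokenizer's type at compile time, and the dynamic path, where `.dynamic()` / `.filter_dynamic(...)` box the tokenizer so the filter chain can be assembled at runtime. It assumes a tantivy build that contains this change; the `main` wrapper, the sample text, and the expected tokens are illustrative only.

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    // Static path: filters are composed into the analyzer's concrete type, no per-filter boxing.
    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();

    // Dynamic path: the tokenizer is boxed and each filter is applied through the boxed
    // builder. Handy when the filter chain is only known at runtime (e.g. deserialized
    // from a config), at the cost the new doc comment on `dynamic()` warns about.
    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .dynamic()
        .filter_dynamic(RemoveLongFilter::limit(40))
        .filter_dynamic(LowerCaser)
        .build();

    // Both paths produce a plain `TextAnalyzer` (now `Clone`) and tokenize identically.
    for analyzer in [&mut static_analyzer, &mut dynamic_analyzer] {
        let mut stream = analyzer.token_stream("Hello, World!");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        assert_eq!(tokens, vec!["hello", "world"]);
    }
}
```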