mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-26 12:09:57 +00:00
WIP
This commit is contained in:
committed by
Paul Masurel
parent
98a3b01992
commit
81330aaf89
@@ -1,5 +1,5 @@
|
|||||||
use criterion::{criterion_group, criterion_main, Criterion};
|
use criterion::{criterion_group, criterion_main, Criterion};
|
||||||
use tantivy::tokenizer::TokenizerManager;
|
use tantivy::tokenizer::{TokenizerManager, TextAnalyzer, RemoveLongFilter, LowerCaser, SimpleTokenizer};
|
||||||
|
|
||||||
const ALICE_TXT: &str = include_str!("alice.txt");
|
const ALICE_TXT: &str = include_str!("alice.txt");
|
||||||
|
|
||||||
@@ -16,7 +16,40 @@ pub fn criterion_benchmark(c: &mut Criterion) {
|
|||||||
assert_eq!(word_count, 30_731);
|
assert_eq!(word_count, 30_731);
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||||
|
.filter(RemoveLongFilter::limit(40))
|
||||||
|
.filter(LowerCaser)
|
||||||
|
.build();
|
||||||
|
c.bench_function("static-tokenize-alice", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let mut word_count = 0;
|
||||||
|
let mut token_stream = static_analyzer.token_stream(ALICE_TXT);
|
||||||
|
while token_stream.advance() {
|
||||||
|
word_count += 1;
|
||||||
|
}
|
||||||
|
assert_eq!(word_count, 30_731);
|
||||||
|
})
|
||||||
|
});
|
||||||
|
let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||||
|
.dynamic()
|
||||||
|
.filter_dynamic(RemoveLongFilter::limit(40))
|
||||||
|
.filter_dynamic(LowerCaser)
|
||||||
|
.build();
|
||||||
|
c.bench_function("dynamic-tokenize-alice", |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let mut word_count = 0;
|
||||||
|
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
|
||||||
|
while token_stream.advance() {
|
||||||
|
word_count += 1;
|
||||||
|
}
|
||||||
|
assert_eq!(word_count, 30_731);
|
||||||
|
})
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group!(benches, criterion_benchmark);
|
criterion_group! {
|
||||||
|
name = benches;
|
||||||
|
config = Criterion::default().sample_size(200);
|
||||||
|
targets = criterion_benchmark
|
||||||
|
}
|
||||||
criterion_main!(benches);
|
criterion_main!(benches);
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
|
|||||||
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
||||||
|
|
||||||
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
|
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
|
||||||
|
#[derive(Clone)]
|
||||||
pub struct TextAnalyzer {
|
pub struct TextAnalyzer {
|
||||||
tokenizer: Box<dyn BoxableTokenizer>,
|
tokenizer: Box<dyn BoxableTokenizer>,
|
||||||
}
|
}
|
||||||
@@ -19,7 +20,7 @@ impl Tokenizer for Box<dyn BoxableTokenizer> {
|
|||||||
|
|
||||||
impl Clone for Box<dyn BoxableTokenizer> {
|
impl Clone for Box<dyn BoxableTokenizer> {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
self.box_clone()
|
(**self).box_clone()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -40,14 +41,6 @@ impl<T: Tokenizer> BoxableTokenizer for T {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for TextAnalyzer {
|
|
||||||
fn clone(&self) -> Self {
|
|
||||||
TextAnalyzer {
|
|
||||||
tokenizer: self.tokenizer.box_clone(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for TextAnalyzer {
|
impl Default for TextAnalyzer {
|
||||||
fn default() -> TextAnalyzer {
|
fn default() -> TextAnalyzer {
|
||||||
TextAnalyzer::from(EmptyTokenizer)
|
TextAnalyzer::from(EmptyTokenizer)
|
||||||
@@ -97,7 +90,10 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Boxes the internal tokenizer. This is useful to write generic code.
|
/// Boxes the internal tokenizer. This is useful to write generic code.
|
||||||
|
/// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
|
||||||
|
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
|
||||||
|
/// will be more performant and create fewer boxes.
|
||||||
pub fn dynamic(self) -> TextAnalyzerBuilder {
|
pub fn dynamic(self) -> TextAnalyzerBuilder {
|
||||||
let boxed_tokenizer = Box::new(self.tokenizer);
|
let boxed_tokenizer = Box::new(self.tokenizer);
|
||||||
TextAnalyzerBuilder {
|
TextAnalyzerBuilder {
|
||||||
@@ -106,7 +102,6 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Applies a filter and returns a boxed version of the TextAnalyzerBuilder.
|
/// Applies a filter and returns a boxed version of the TextAnalyzerBuilder.
|
||||||
/// (If we prefer we can remove this method)
|
|
||||||
pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
|
pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
|
||||||
self.filter(token_filter).dynamic()
|
self.filter(token_filter).dynamic()
|
||||||
}
|
}
|
||||||
@@ -124,18 +119,18 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
|
use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer, SimpleTokenizer};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_text_analyzer_builder() {
|
fn test_text_analyzer_builder() {
|
||||||
let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
|
let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||||
.filter(AlphaNumOnlyFilter)
|
.filter(RemoveLongFilter::limit(40))
|
||||||
.filter(RemoveLongFilter::limit(6))
|
|
||||||
.filter(LowerCaser)
|
.filter(LowerCaser)
|
||||||
.build();
|
.build()
|
||||||
|
.clone();
|
||||||
let mut stream = analyzer.token_stream("- first bullet point");
|
let mut stream = analyzer.token_stream("- first bullet point");
|
||||||
assert_eq!(stream.next().unwrap().text, "first");
|
assert_eq!(stream.next().unwrap().text, "first");
|
||||||
assert_eq!(stream.next().unwrap().text, "point");
|
assert_eq!(stream.next().unwrap().text, "bullet");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -156,19 +151,20 @@ mod tests {
|
|||||||
SerializableTokenFilterEnum::LowerCaser(LowerCaser),
|
SerializableTokenFilterEnum::LowerCaser(LowerCaser),
|
||||||
SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
|
SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
|
||||||
];
|
];
|
||||||
let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(WhitespaceTokenizer::default())
|
let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||||
.dynamic();
|
.filter_dynamic(RemoveLongFilter::limit(40))
|
||||||
for filter in filters {
|
.filter_dynamic(LowerCaser);
|
||||||
analyzer_builder =
|
// for filter in filters {
|
||||||
match filter {
|
// analyzer_builder =
|
||||||
SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
|
// match filter {
|
||||||
analyzer_builder.filter_dynamic(lower_caser),
|
// SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
|
||||||
SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
|
// analyzer_builder.filter_dynamic(lower_caser),
|
||||||
analyzer_builder.filter_dynamic(remove_long_filter)
|
// SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
|
||||||
},
|
// analyzer_builder.filter_dynamic(remove_long_filter)
|
||||||
}
|
// },
|
||||||
}
|
// }
|
||||||
let mut analyzer = analyzer_builder.build();
|
// }
|
||||||
|
let mut analyzer = analyzer_builder.build().clone();
|
||||||
let mut stream = analyzer.token_stream("first bullet point");
|
let mut stream = analyzer.token_stream("first bullet point");
|
||||||
assert_eq!(stream.next().unwrap().text, "first");
|
assert_eq!(stream.next().unwrap().text, "first");
|
||||||
assert_eq!(stream.next().unwrap().text, "bullet");
|
assert_eq!(stream.next().unwrap().text, "bullet");
|
||||||
|
|||||||
Reference in New Issue
Block a user