mirror of https://github.com/quickwit-oss/tantivy.git
* tokenizer-api: reduce Tokenizer overhead. Previously a new `Token` was created for each text encountered, each containing a `String::with_capacity(200)` allocation. In the new API the token stream gets mutable access to the tokenizer, which allows state to be shared (in this PR the `Token` is shared); see the sketch below. Ideally the allocation for the `BoxTokenStream` would also be removed, but this may require some lifetime tricks.
* simplify api
* move lowercase and ascii folding buffer to global
* empty `Token` text as default
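A minimal sketch of the shared-state pattern described above. The type names, fields, and method shapes here are illustrative only, not tantivy's actual trait definitions: the tokenizer owns a reusable `Token`, and `token_stream` takes `&mut self` so the returned stream can refill that buffer instead of allocating a fresh string per text.

// Illustrative sketch; not tantivy's real API.
#[derive(Default)]
struct Token {
    text: String,
    position: usize,
}

/// The tokenizer owns a single reusable `Token` buffer.
#[derive(Default)]
struct SimpleTokenizer {
    token: Token,
}

/// The stream borrows the tokenizer's buffer mutably, so no per-call allocation is needed.
struct SimpleTokenStream<'a> {
    token: &'a mut Token,
    words: std::str::SplitWhitespace<'a>,
}

impl SimpleTokenizer {
    // Taking `&mut self` (rather than `&self`) is what lets the token buffer be reused.
    fn token_stream<'a>(&'a mut self, text: &'a str) -> SimpleTokenStream<'a> {
        self.token.text.clear();
        self.token.position = 0;
        SimpleTokenStream {
            token: &mut self.token,
            words: text.split_whitespace(),
        }
    }
}

impl<'a> SimpleTokenStream<'a> {
    fn advance(&mut self) -> bool {
        match self.words.next() {
            Some(word) => {
                // Refill the shared buffer in place.
                self.token.text.clear();
                self.token.text.push_str(word);
                self.token.position += 1;
                true
            }
            None => false,
        }
    }

    fn token(&self) -> &Token {
        &*self.token
    }
}

fn main() {
    let mut tokenizer = SimpleTokenizer::default();
    let mut stream = tokenizer.token_stream("hello tokenizer world");
    let mut count = 0;
    while stream.advance() {
        count += 1;
    }
    assert_eq!(count, 3);
}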
23 lines
737 B
Rust
use criterion::{criterion_group, criterion_main, Criterion};

use tantivy::tokenizer::TokenizerManager;

const ALICE_TXT: &str = include_str!("alice.txt");

pub fn criterion_benchmark(c: &mut Criterion) {
    let tokenizer_manager = TokenizerManager::default();
    // The "default" tokenizer is always registered, so unwrap is safe here.
    let mut tokenizer = tokenizer_manager.get("default").unwrap();
    c.bench_function("default-tokenize-alice", |b| {
        b.iter(|| {
            // Tokenize the full text of alice.txt and count the tokens produced.
            let mut word_count = 0;
            let mut token_stream = tokenizer.token_stream(ALICE_TXT);
            while token_stream.advance() {
                word_count += 1;
            }
            // Guards against accidental changes in default-tokenizer behavior.
            assert_eq!(word_count, 30_731);
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
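As with any criterion benchmark, this file is presumably registered as a bench target with `harness = false` in the crate's `Cargo.toml` (the exact target name may differ) and run with `cargo bench`. The `assert_eq!` on the token count means a behavioral change in the default tokenizer fails the benchmark instead of silently skewing its timings.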