mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-25 20:50:43 +00:00
* tokenizer-api: reduce Tokenizer overhead. Previously, a new `Token` was created for each text encountered, and each `Token` contains a `String::with_capacity(200)`. In the new API, the token_stream gets mutable access to the tokenizer, which allows state to be shared (in this PR, the Token is shared). Ideally the allocation for the BoxTokenStream would also be removed, but this may require some lifetime tricks. * simplify api * move lowercase and ascii folding buffer to global * empty Token text as default
43 lines
900 B
Rust
43 lines
900 B
Rust
use crate::tokenizer::{Token, TokenStream, Tokenizer};
|
|
|
|
/// Tokenizer whose token streams never produce a token.
///
/// It is a stateless unit struct: whatever text is passed to
/// `token_stream`, the returned stream is immediately exhausted.
#[derive(Clone, Debug, Default)]
pub(crate) struct EmptyTokenizer;
|
|
|
|
impl Tokenizer for EmptyTokenizer {
|
|
type TokenStream<'a> = EmptyTokenStream;
|
|
fn token_stream(&mut self, _text: &str) -> EmptyTokenStream {
|
|
EmptyTokenStream::default()
|
|
}
|
|
}
|
|
|
|
#[derive(Default)]
|
|
pub struct EmptyTokenStream {
|
|
token: Token,
|
|
}
|
|
|
|
impl TokenStream for EmptyTokenStream {
|
|
fn advance(&mut self) -> bool {
|
|
false
|
|
}
|
|
|
|
fn token(&self) -> &super::Token {
|
|
&self.token
|
|
}
|
|
|
|
fn token_mut(&mut self) -> &mut super::Token {
|
|
&mut self.token
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::tokenizer::{TokenStream, Tokenizer};

    /// The empty tokenizer must yield no token, whatever the input text.
    #[test]
    fn test_empty_tokenizer() {
        let mut tokenizer = super::EmptyTokenizer;
        let mut empty = tokenizer.token_stream("whatever string");
        assert!(!empty.advance());
        // The stream must stay exhausted on repeated calls.
        assert!(!empty.advance());
    }
}
|