mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-24 20:20:44 +00:00
* tokenizer-api: reduce Tokenizer overhead. Previously, a new `Token` was created for each text encountered, each of which contains a `String::with_capacity(200)`. In the new API, the token_stream gets mutable access to the tokenizer, which allows state to be shared (in this PR, the Token is shared). Ideally the allocation for the BoxTokenStream would also be removed, but this may require some lifetime tricks. * simplify api * move lowercase and ascii folding buffer to global * empty Token text as default
145 lines
3.7 KiB
Rust
145 lines
3.7 KiB
Rust
use std::borrow::Cow;
|
|
use std::mem;
|
|
|
|
use rust_stemmers::{self, Algorithm};
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
|
|
|
/// Available stemmer languages.
|
|
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
|
|
#[allow(missing_docs)]
|
|
pub enum Language {
|
|
Arabic,
|
|
Danish,
|
|
Dutch,
|
|
English,
|
|
Finnish,
|
|
French,
|
|
German,
|
|
Greek,
|
|
Hungarian,
|
|
Italian,
|
|
Norwegian,
|
|
Portuguese,
|
|
Romanian,
|
|
Russian,
|
|
Spanish,
|
|
Swedish,
|
|
Tamil,
|
|
Turkish,
|
|
}
|
|
|
|
impl Language {
|
|
fn algorithm(self) -> Algorithm {
|
|
use self::Language::*;
|
|
match self {
|
|
Arabic => Algorithm::Arabic,
|
|
Danish => Algorithm::Danish,
|
|
Dutch => Algorithm::Dutch,
|
|
English => Algorithm::English,
|
|
Finnish => Algorithm::Finnish,
|
|
French => Algorithm::French,
|
|
German => Algorithm::German,
|
|
Greek => Algorithm::Greek,
|
|
Hungarian => Algorithm::Hungarian,
|
|
Italian => Algorithm::Italian,
|
|
Norwegian => Algorithm::Norwegian,
|
|
Portuguese => Algorithm::Portuguese,
|
|
Romanian => Algorithm::Romanian,
|
|
Russian => Algorithm::Russian,
|
|
Spanish => Algorithm::Spanish,
|
|
Swedish => Algorithm::Swedish,
|
|
Tamil => Algorithm::Tamil,
|
|
Turkish => Algorithm::Turkish,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// `Stemmer` token filter. Several languages are supported, see [`Language`] for the available
|
|
/// languages.
|
|
/// Tokens are expected to be lowercased beforehand.
|
|
#[derive(Clone)]
|
|
pub struct Stemmer {
|
|
stemmer_algorithm: Algorithm,
|
|
}
|
|
|
|
impl Stemmer {
|
|
/// Creates a new `Stemmer` [`TokenFilter`] for a given language algorithm.
|
|
pub fn new(language: Language) -> Stemmer {
|
|
Stemmer {
|
|
stemmer_algorithm: language.algorithm(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Default for Stemmer {
|
|
/// Creates a new `Stemmer` [`TokenFilter`] for [`Language::English`].
|
|
fn default() -> Self {
|
|
Stemmer::new(Language::English)
|
|
}
|
|
}
|
|
|
|
impl TokenFilter for Stemmer {
|
|
type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
|
|
|
|
fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
|
|
StemmerFilter {
|
|
stemmer_algorithm: self.stemmer_algorithm,
|
|
inner: tokenizer,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Clone)]
|
|
pub struct StemmerFilter<T> {
|
|
stemmer_algorithm: Algorithm,
|
|
inner: T,
|
|
}
|
|
|
|
impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
|
|
type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
|
|
|
|
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
|
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
|
StemmerTokenStream {
|
|
tail: self.inner.token_stream(text),
|
|
stemmer,
|
|
buffer: String::new(),
|
|
}
|
|
}
|
|
}
|
|
|
|
pub struct StemmerTokenStream<T> {
|
|
tail: T,
|
|
stemmer: rust_stemmers::Stemmer,
|
|
buffer: String,
|
|
}
|
|
|
|
impl<T: TokenStream> TokenStream for StemmerTokenStream<T> {
|
|
fn advance(&mut self) -> bool {
|
|
if !self.tail.advance() {
|
|
return false;
|
|
}
|
|
let token = self.tail.token_mut();
|
|
let stemmed_str = self.stemmer.stem(&token.text);
|
|
match stemmed_str {
|
|
Cow::Owned(stemmed_str) => token.text = stemmed_str,
|
|
Cow::Borrowed(stemmed_str) => {
|
|
self.buffer.clear();
|
|
self.buffer.push_str(stemmed_str);
|
|
mem::swap(&mut token.text, &mut self.buffer);
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
fn token(&self) -> &Token {
|
|
self.tail.token()
|
|
}
|
|
|
|
fn token_mut(&mut self) -> &mut Token {
|
|
self.tail.token_mut()
|
|
}
|
|
}
|