// Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-08 18:12:55 +00:00); 128 lines, 3.3 KiB, Rust.
use std::borrow::Cow;
|
|
use std::mem;
|
|
|
|
use rust_stemmers::{self, Algorithm};
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use super::{Token, TokenFilter, TokenStream};
|
|
|
|
/// Available stemmer languages.
|
|
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
|
|
#[allow(missing_docs)]
|
|
pub enum Language {
|
|
Arabic,
|
|
Danish,
|
|
Dutch,
|
|
English,
|
|
Finnish,
|
|
French,
|
|
German,
|
|
Greek,
|
|
Hungarian,
|
|
Italian,
|
|
Norwegian,
|
|
Portuguese,
|
|
Romanian,
|
|
Russian,
|
|
Spanish,
|
|
Swedish,
|
|
Tamil,
|
|
Turkish,
|
|
}
|
|
|
|
impl Language {
|
|
fn algorithm(self) -> Algorithm {
|
|
use self::Language::*;
|
|
match self {
|
|
Arabic => Algorithm::Arabic,
|
|
Danish => Algorithm::Danish,
|
|
Dutch => Algorithm::Dutch,
|
|
English => Algorithm::English,
|
|
Finnish => Algorithm::Finnish,
|
|
French => Algorithm::French,
|
|
German => Algorithm::German,
|
|
Greek => Algorithm::Greek,
|
|
Hungarian => Algorithm::Hungarian,
|
|
Italian => Algorithm::Italian,
|
|
Norwegian => Algorithm::Norwegian,
|
|
Portuguese => Algorithm::Portuguese,
|
|
Romanian => Algorithm::Romanian,
|
|
Russian => Algorithm::Russian,
|
|
Spanish => Algorithm::Spanish,
|
|
Swedish => Algorithm::Swedish,
|
|
Tamil => Algorithm::Tamil,
|
|
Turkish => Algorithm::Turkish,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// `Stemmer` token filter. Several languages are supported, see [`Language`] for the available
|
|
/// languages.
|
|
/// Tokens are expected to be lowercased beforehand.
|
|
#[derive(Clone)]
|
|
pub struct Stemmer {
|
|
stemmer_algorithm: Algorithm,
|
|
}
|
|
|
|
impl Stemmer {
|
|
/// Creates a new `Stemmer` [`TokenFilter`] for a given language algorithm.
|
|
pub fn new(language: Language) -> Stemmer {
|
|
Stemmer {
|
|
stemmer_algorithm: language.algorithm(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Default for Stemmer {
|
|
/// Creates a new `Stemmer` [`TokenFilter`] for [`Language::English`].
|
|
fn default() -> Self {
|
|
Stemmer::new(Language::English)
|
|
}
|
|
}
|
|
|
|
impl TokenFilter for Stemmer {
|
|
type OutputTokenStream<T: TokenStream> = StemmerTokenStream<T>;
|
|
|
|
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
|
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
|
StemmerTokenStream {
|
|
tail: token_stream,
|
|
stemmer,
|
|
buffer: String::new(),
|
|
}
|
|
}
|
|
}
|
|
|
|
pub struct StemmerTokenStream<T> {
|
|
tail: T,
|
|
stemmer: rust_stemmers::Stemmer,
|
|
buffer: String,
|
|
}
|
|
|
|
impl<T: TokenStream> TokenStream for StemmerTokenStream<T> {
|
|
fn advance(&mut self) -> bool {
|
|
if !self.tail.advance() {
|
|
return false;
|
|
}
|
|
let token = self.tail.token_mut();
|
|
let stemmed_str = self.stemmer.stem(&token.text);
|
|
match stemmed_str {
|
|
Cow::Owned(stemmed_str) => token.text = stemmed_str,
|
|
Cow::Borrowed(stemmed_str) => {
|
|
self.buffer.clear();
|
|
self.buffer.push_str(stemmed_str);
|
|
mem::swap(&mut token.text, &mut self.buffer);
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
fn token(&self) -> &Token {
|
|
self.tail.token()
|
|
}
|
|
|
|
fn token_mut(&mut self) -> &mut Token {
|
|
self.tail.token_mut()
|
|
}
|
|
}
|