mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
Revert "Fix non english stemmers (#521)"
This reverts commit 2cd31bcda2.
This commit is contained in:
@@ -228,27 +228,27 @@ pub mod tests {
|
||||
fn test_non_en_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register(
|
||||
"el_stem",
|
||||
"es_stem",
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new(Language::Greek)),
|
||||
.filter(Stemmer::new(Language::Spanish)),
|
||||
);
|
||||
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
|
||||
let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
en_tokenizer
|
||||
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
|
||||
.token_stream("Hola, feliz contribuyente!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
|
||||
assert_eq!(tokens.len(), 3);
|
||||
assert_token(&tokens[0], 0, "καλημερ", 0, 16);
|
||||
assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
|
||||
assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
|
||||
assert_token(&tokens[0], 0, "hola", 0, 4);
|
||||
assert_token(&tokens[1], 1, "feliz", 6, 11);
|
||||
assert_token(&tokens[2], 2, "contribuyent", 12, 25);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Available stemmer languages.
|
||||
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
|
||||
@@ -56,14 +57,14 @@ impl Language {
|
||||
/// Tokens are expected to be lowercased beforehand.
|
||||
#[derive(Clone)]
|
||||
pub struct Stemmer {
|
||||
stemmer_algorithm: Algorithm,
|
||||
stemmer_algorithm: Arc<Algorithm>,
|
||||
}
|
||||
|
||||
impl Stemmer {
|
||||
/// Creates a new Stemmer `TokenFilter` for a given language algorithm.
|
||||
pub fn new(language: Language) -> Stemmer {
|
||||
Stemmer {
|
||||
stemmer_algorithm: language.algorithm(),
|
||||
stemmer_algorithm: Arc::new(language.algorithm()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -82,7 +83,7 @@ where
|
||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
|
||||
StemmerTokenStream::wrap(inner_stemmer, token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user