From b3a201e6655c2a2589ed1e348bf8b11a9be08195 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Wed, 27 Mar 2019 08:54:55 +0900
Subject: [PATCH] Revert "Fix non english stemmers (#521)"

This reverts commit 2cd31bcda2aa8cb3a2db47ee43ecc7a283e014b8.
---
 src/tokenizer/mod.rs     | 14 +++++++-------
 src/tokenizer/stemmer.rs |  7 ++++---
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index e07116fc9..8ef2f8be0 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -228,27 +228,27 @@ pub mod tests {
     fn test_non_en_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
-            "el_stem",
+            "es_stem",
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::Greek)),
+                .filter(Stemmer::new(Language::Spanish)),
         );
-        let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
+        let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
                 tokens.push(token.clone());
             };
             en_tokenizer
-                .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
+                .token_stream("Hola, feliz contribuyente!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "καλημερ", 0, 16);
-        assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
-        assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
+        assert_token(&tokens[0], 0, "hola", 0, 4);
+        assert_token(&tokens[1], 1, "feliz", 6, 11);
+        assert_token(&tokens[2], 2, "contribuyent", 12, 25);
     }
 
     #[test]
diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs
index f9cacce1a..33c45cab4 100644
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -2,6 +2,7 @@
 use super::{Token, TokenFilter, TokenStream};
 use rust_stemmers::{self, Algorithm};
+use std::sync::Arc;
 
 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -56,14 +57,14 @@ impl Language {
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
-    stemmer_algorithm: Algorithm,
+    stemmer_algorithm: Arc<Algorithm>,
 }
 
 impl Stemmer {
     /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
     pub fn new(language: Language) -> Stemmer {
         Stemmer {
-            stemmer_algorithm: language.algorithm(),
+            stemmer_algorithm: Arc::new(language.algorithm()),
         }
     }
 }
@@ -82,7 +83,7 @@ where
     type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
 
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
+        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
         StemmerTokenStream::wrap(inner_stemmer, token_stream)
     }
 }
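
Note (not part of the patch): a minimal standalone sketch of how rust_stemmers picks its stemming algorithm. Because the reverted `transform` above recreates the inner stemmer with a hardcoded `Algorithm::English`, the `Language` passed to `Stemmer::new` no longer influences the stemming output. The token below is taken from the restored test; the printed stems are illustrative, not values asserted by the patch.

// Sketch assuming the rust_stemmers crate, as used in the patch. It only
// demonstrates that the produced stem depends on the Algorithm passed to
// Stemmer::create, which is exactly what the reverted `transform` hardcodes.
use rust_stemmers::{Algorithm, Stemmer};

fn main() {
    // Lowercased input, as the LowerCaser filter in the test chain would produce.
    let token = "contribuyente";

    let english = Stemmer::create(Algorithm::English);
    let spanish = Stemmer::create(Algorithm::Spanish);

    // With the revert applied, tantivy's Stemmer filter behaves like `english`
    // here regardless of the Language it was constructed with.
    println!("english stem: {}", english.stem(token));
    println!("spanish stem: {}", spanish.stem(token));
}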