From 1d4fa4547cda4be403e1237e013560800a5ff8ee Mon Sep 17 00:00:00 2001
From: Panagiotis Ktistakis
Date: Wed, 27 Mar 2019 01:54:16 +0200
Subject: [PATCH] Fix non english stemmers (#521)

---
 src/tokenizer/mod.rs     | 14 +++++++-------
 src/tokenizer/stemmer.rs |  7 +++----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 8ef2f8be0..e07116fc9 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -228,27 +228,27 @@ pub mod tests {
     fn test_non_en_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
-            "es_stem",
+            "el_stem",
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::Spanish)),
+                .filter(Stemmer::new(Language::Greek)),
         );
 
-        let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
+        let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
                 tokens.push(token.clone());
             };
             en_tokenizer
-                .token_stream("Hola, feliz contribuyente!")
+                .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hola", 0, 4);
-        assert_token(&tokens[1], 1, "feliz", 6, 11);
-        assert_token(&tokens[2], 2, "contribuyent", 12, 25);
+        assert_token(&tokens[0], 0, "καλημερ", 0, 16);
+        assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
+        assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
     }
 
     #[test]
diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs
index 33c45cab4..f9cacce1a 100644
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -2,7 +2,6 @@
 use super::{Token, TokenFilter, TokenStream};
 use rust_stemmers::{self, Algorithm};
-use std::sync::Arc;
 
 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
 pub enum Language {
@@ -57,14 +56,14 @@ impl Language {
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
-    stemmer_algorithm: Arc<Algorithm>,
+    stemmer_algorithm: Algorithm,
 }
 
 impl Stemmer {
     /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
     pub fn new(language: Language) -> Stemmer {
         Stemmer {
-            stemmer_algorithm: Arc::new(language.algorithm()),
+            stemmer_algorithm: language.algorithm(),
         }
     }
 }
@@ -83,7 +82,7 @@ where
     type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
 
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
+        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream::wrap(inner_stemmer, token_stream)
     }
 }