diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index d5a804145..bef61daaf 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -73,7 +73,7 @@
 //! let en_stem = SimpleTokenizer
 //!     .filter(RemoveLongFilter::limit(40))
 //!     .filter(LowerCaser)
-//!     .filter(Stemmer::new());
+//!     .filter(Stemmer::new(Language::English));
 //! # }
 //! ```
 //!
@@ -148,7 +148,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
 pub use self::raw_tokenizer::RawTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::simple_tokenizer::SimpleTokenizer;
-pub use self::stemmer::Stemmer;
+pub use self::stemmer::{Stemmer, Language};
 pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
 pub(crate) use self::tokenizer::box_tokenizer;
@@ -159,8 +159,16 @@ pub use self::tokenizer_manager::TokenizerManager;
 
 #[cfg(test)]
 pub mod tests {
-    use super::Token;
-    use super::TokenizerManager;
+    use super::{
+        Token,
+        TokenizerManager,
+        SimpleTokenizer,
+        Tokenizer,
+        RemoveLongFilter,
+        LowerCaser,
+        Stemmer,
+        Language
+    };
 
     /// This is a function that can be used in tests and doc tests
     /// to assert a token's correctness.
@@ -214,6 +222,7 @@ pub mod tests {
             .token_stream("Hello, happy tax payer!")
             .process(&mut add_token);
         }
+        assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "hello", 0, 5);
         assert_token(&tokens[1], 1, "happi", 7, 12);
         assert_token(&tokens[2], 2, "tax", 13, 16);
@@ -221,6 +230,33 @@ pub mod tests {
         assert_token(&tokens[3], 3, "payer", 17, 22);
     }
 
+    #[test]
+    fn test_non_en_tokenizer() {
+        let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register(
+            "es_stem",
+            SimpleTokenizer
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser)
+                .filter(Stemmer::new(Language::Spanish)),
+        );
+        let es_tokenizer = tokenizer_manager.get("es_stem").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            es_tokenizer
+                .token_stream("Hola, feliz contribuyente!")
+                .process(&mut add_token);
+        }
+
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hola", 0, 4);
+        assert_token(&tokens[1], 1, "feliz", 6, 11);
+        assert_token(&tokens[2], 2, "contribuyent", 12, 25);
+    }
+
     #[test]
     fn test_tokenizer_empty() {
         let tokenizer_manager = TokenizerManager::default();
diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs
index 064662889..19980a59d 100644
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -4,22 +4,77 @@
 use super::{Token, TokenFilter, TokenStream};
 use rust_stemmers::{self, Algorithm};
 use std::sync::Arc;
 
-/// `Stemmer` token filter. Currently only English is supported.
-/// Tokens are expected to be lowercased beforehands.
+/// Available stemmer languages.
+#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
+#[allow(missing_docs)]
+pub enum Language {
+    Arabic,
+    Danish,
+    Dutch,
+    English,
+    Finnish,
+    French,
+    German,
+    Hungarian,
+    Italian,
+    Portuguese,
+    Romanian,
+    Russian,
+    Spanish,
+    Swedish,
+    Tamil,
+    Turkish
+}
+
+impl Language {
+    fn algorithm(&self) -> Algorithm {
+        use self::Language::*;
+
+        match self {
+            Arabic => Algorithm::Arabic,
+            Danish => Algorithm::Danish,
+            Dutch => Algorithm::Dutch,
+            English => Algorithm::English,
+            Finnish => Algorithm::Finnish,
+            French => Algorithm::French,
+            German => Algorithm::German,
+            Hungarian => Algorithm::Hungarian,
+            Italian => Algorithm::Italian,
+            Portuguese => Algorithm::Portuguese,
+            Romanian => Algorithm::Romanian,
+            Russian => Algorithm::Russian,
+            Spanish => Algorithm::Spanish,
+            Swedish => Algorithm::Swedish,
+            Tamil => Algorithm::Tamil,
+            Turkish => Algorithm::Turkish
+        }
+    }
+}
+
+/// `Stemmer` token filter. Several languages are supported; see `Language` for the
+/// complete list.
+/// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
     stemmer_algorithm: Arc<Algorithm>,
 }
 
 impl Stemmer {
-    /// Creates a new Stemmer `TokenFilter`.
-    pub fn new() -> Stemmer {
+    /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
+    pub fn new(language: Language) -> Stemmer {
         Stemmer {
-            stemmer_algorithm: Arc::new(Algorithm::English),
+            stemmer_algorithm: Arc::new(language.algorithm()),
         }
     }
 }
 
+impl Default for Stemmer {
+    /// Creates a new Stemmer `TokenFilter` for English.
+    fn default() -> Self {
+        Stemmer::new(Language::English)
+    }
+}
+
 impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
 where TailTokenStream: TokenStream,
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index d73f84e93..46808d07e 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -64,7 +64,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
 ///     let en_stem = SimpleTokenizer
 ///         .filter(RemoveLongFilter::limit(40))
 ///         .filter(LowerCaser)
-///         .filter(Stemmer::new());
+///         .filter(Stemmer::default());
 /// # }
 /// ```
 ///
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index 001469f35..37115e976 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -8,6 +8,7 @@ use tokenizer::RemoveLongFilter;
 use tokenizer::SimpleTokenizer;
 use tokenizer::Stemmer;
 use tokenizer::Tokenizer;
+use tokenizer::stemmer::Language;
 
 /// The tokenizer manager serves as a store for
 /// all of the pre-configured tokenizer pipelines.
@@ -71,7 +72,7 @@ impl Default for TokenizerManager {
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::English)),
+                .filter(Stemmer::new(Language::English)),
         );
         manager
     }
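
Usage sketch: the snippet below exercises the new `Language` enum and the `Stemmer::default()` fallback through the `tantivy::tokenizer` re-exports added in this diff. It is a minimal sketch, not part of the change: the pipeline name `de_stem` and the sample text are made up for illustration, and the stemmed output is printed rather than asserted, since the exact stems come from the snowball algorithms in `rust-stemmers`.

```rust
use tantivy::tokenizer::{
    Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, Tokenizer,
    TokenizerManager,
};

fn main() {
    let manager = TokenizerManager::default();

    // Register a German pipeline next to the built-in "en_stem".
    // Any other `Language` variant works the same way; the name
    // "de_stem" is hypothetical.
    manager.register(
        "de_stem",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            // The stemmer expects lowercased tokens, so LowerCaser comes first.
            .filter(LowerCaser)
            // Stemmer::default() would keep the old English-only behavior.
            .filter(Stemmer::new(Language::German)),
    );

    let de_stem = manager.get("de_stem").unwrap();
    let mut stems: Vec<String> = vec![];
    {
        // Collect the stemmed token texts, mirroring the tests above.
        let mut add_token = |token: &Token| stems.push(token.text.clone());
        de_stem
            .token_stream("Häuser und Gärten")
            .process(&mut add_token);
    }
    println!("{:?}", stems);
}
```

Note that giving `Stemmer::new` a `Language` parameter is a breaking change for callers of the old zero-argument constructor; the new `Default` impl preserves the previous English behavior, which is what the doc example in tokenizer.rs now relies on.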