Allow stemmers in languages other than English (#478)

Allow users to create stemmers for languages other than English. Add a
default stemmer for English.

Closes #478
This commit is contained in:
Paul Masurel
2019-01-23 22:21:00 +09:00
committed by GitHub
parent 74f70a5c2c
commit 0b0bf59a32
4 changed files with 103 additions and 11 deletions

View File

@@ -73,7 +73,7 @@
//! let en_stem = SimpleTokenizer
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser)
//! .filter(Stemmer::new());
//! .filter(Stemmer::new(Language::English));
//! # }
//! ```
//!
@@ -148,7 +148,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::Stemmer;
pub use self::stemmer::{Stemmer, Language};
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub(crate) use self::tokenizer::box_tokenizer;
@@ -159,8 +159,16 @@ pub use self::tokenizer_manager::TokenizerManager;
#[cfg(test)]
pub mod tests {
use super::Token;
use super::TokenizerManager;
use super::{
Token,
TokenizerManager,
SimpleTokenizer,
Tokenizer,
RemoveLongFilter,
LowerCaser,
Stemmer,
Language
};
/// This is a function that can be used in tests and doc tests
/// to assert a token's correctness.
@@ -214,6 +222,7 @@ pub mod tests {
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hello", 0, 5);
assert_token(&tokens[1], 1, "happi", 7, 12);
@@ -221,6 +230,33 @@ pub mod tests {
assert_token(&tokens[3], 3, "payer", 17, 22);
}
/// Registers a Spanish stemming pipeline under a custom name and checks that it
/// lowercases and stems each token while keeping positions and byte offsets.
#[test]
fn test_non_en_tokenizer() {
    let tokenizer_manager = TokenizerManager::default();
    tokenizer_manager.register(
        "es_stem",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(Stemmer::new(Language::Spanish)),
    );
    // Named after the pipeline it actually holds (Spanish, not English).
    let es_tokenizer = tokenizer_manager.get("es_stem").unwrap();
    let mut tokens: Vec<Token> = vec![];
    {
        // Scoped so that the mutable borrow of `tokens` ends before the assertions.
        let mut add_token = |token: &Token| {
            tokens.push(token.clone());
        };
        es_tokenizer
            .token_stream("Hola, feliz contribuyente!")
            .process(&mut add_token);
    }
    assert_eq!(tokens.len(), 3);
    assert_token(&tokens[0], 0, "hola", 0, 4);
    assert_token(&tokens[1], 1, "feliz", 6, 11);
    assert_token(&tokens[2], 2, "contribuyent", 12, 25);
}
#[test]
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default();

View File

@@ -4,22 +4,77 @@ use super::{Token, TokenFilter, TokenStream};
use rust_stemmers::{self, Algorithm};
use std::sync::Arc;
/// `Stemmer` token filter. Currently only English is supported.
/// Tokens are expected to be lowercased beforehands.
/// Available stemmer languages.
///
/// Each variant is translated by `Language::algorithm` into the
/// `rust_stemmers::Algorithm` variant of the same name.
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
// NOTE(review): per-variant docs are presumably skipped because each variant name is
// already the name of the language — confirm before removing this `allow`.
#[allow(missing_docs)]
pub enum Language {
Arabic,
Danish,
Dutch,
English,
Finnish,
French,
German,
Hungarian,
Italian,
Portuguese,
Romanian,
Russian,
Spanish,
Swedish,
Tamil,
Turkish
}
impl Language {
fn algorithm(&self) -> Algorithm {
use self::Language::*;
match self {
Arabic => Algorithm::Arabic,
Danish => Algorithm::Danish,
Dutch => Algorithm::Dutch,
English => Algorithm::English,
Finnish => Algorithm::Finnish,
French => Algorithm::French,
German => Algorithm::German,
Hungarian => Algorithm::Hungarian,
Italian => Algorithm::Italian,
Portuguese => Algorithm::Portuguese,
Romanian => Algorithm::Romanian,
Russian => Algorithm::Russian,
Spanish => Algorithm::Spanish,
Swedish => Algorithm::Swedish,
Tamil => Algorithm::Tamil,
Turkish => Algorithm::Turkish
}
}
}
/// `Stemmer` token filter.
///
/// Several languages are supported, see `Language` for the available languages.
/// Tokens are expected to be lowercased beforehand.
#[derive(Clone)]
pub struct Stemmer {
// Algorithm chosen when the filter is constructed; held behind an `Arc`, so cloning
// the filter clones the pointer rather than the algorithm value.
stemmer_algorithm: Arc<Algorithm>,
}
impl Stemmer {
/// Creates a new Stemmer `TokenFilter`.
pub fn new() -> Stemmer {
/// Creates a new Stemmer `TokenFilter` for a given language algorithm.
pub fn new(language: Language) -> Stemmer {
Stemmer {
stemmer_algorithm: Arc::new(Algorithm::English),
stemmer_algorithm: Arc::new(language.algorithm()),
}
}
}
impl Default for Stemmer {
    /// Builds the default `Stemmer` token filter, which stems English text.
    fn default() -> Self {
        let language = Language::English;
        Self::new(language)
    }
}
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
where
TailTokenStream: TokenStream,

View File

@@ -64,7 +64,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
/// let en_stem = SimpleTokenizer
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser)
/// .filter(Stemmer::new());
/// .filter(Stemmer::default());
/// # }
/// ```
///

View File

@@ -8,6 +8,7 @@ use tokenizer::RemoveLongFilter;
use tokenizer::SimpleTokenizer;
use tokenizer::Stemmer;
use tokenizer::Tokenizer;
use tokenizer::stemmer::Language;
/// The tokenizer manager serves as a store for
/// all of the pre-configured tokenizer pipelines.
@@ -71,7 +72,7 @@ impl Default for TokenizerManager {
SimpleTokenizer
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new()),
.filter(Stemmer::new(Language::English)),
);
manager
}