mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
Allow stemmers in languages other than English (#478)
Allow users to create stemmers for languages other than English. Add a default stemmer for English. Closes #478
This commit is contained in:
@@ -73,7 +73,7 @@
|
||||
//! let en_stem = SimpleTokenizer
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser)
|
||||
//! .filter(Stemmer::new());
|
||||
//! .filter(Stemmer::new(Language::English));
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
@@ -148,7 +148,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
|
||||
pub use self::raw_tokenizer::RawTokenizer;
|
||||
pub use self::remove_long::RemoveLongFilter;
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::Stemmer;
|
||||
pub use self::stemmer::{Stemmer, Language};
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub(crate) use self::tokenizer::box_tokenizer;
|
||||
@@ -159,8 +159,16 @@ pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use super::Token;
|
||||
use super::TokenizerManager;
|
||||
use super::{
|
||||
Token,
|
||||
TokenizerManager,
|
||||
SimpleTokenizer,
|
||||
Tokenizer,
|
||||
RemoveLongFilter,
|
||||
LowerCaser,
|
||||
Stemmer,
|
||||
Language
|
||||
};
|
||||
|
||||
/// This is a function that can be used in tests and doc tests
|
||||
/// to assert a token's correctness.
|
||||
@@ -214,6 +222,7 @@ pub mod tests {
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_token(&tokens[0], 0, "hello", 0, 5);
|
||||
assert_token(&tokens[1], 1, "happi", 7, 12);
|
||||
@@ -221,6 +230,33 @@ pub mod tests {
|
||||
assert_token(&tokens[3], 3, "payer", 17, 22);
|
||||
}
|
||||
|
||||
#[test]
fn test_non_en_tokenizer() {
    // Register a Spanish stemming pipeline under the name "es_stem".
    let tokenizer_manager = TokenizerManager::default();
    let spanish_pipeline = SimpleTokenizer
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::Spanish));
    tokenizer_manager.register("es_stem", spanish_pipeline);

    // Fetch it back by name and collect every token it emits.
    let es_tokenizer = tokenizer_manager.get("es_stem").unwrap();
    let mut tokens: Vec<Token> = Vec::new();
    {
        // The closure borrows `tokens` mutably; the inner scope ends that
        // borrow before the assertions below read the vector.
        let mut collect_token = |token: &Token| {
            tokens.push(token.clone());
        };
        es_tokenizer
            .token_stream("Hola, feliz contribuyente!")
            .process(&mut collect_token);
    }

    // Three tokens are expected, with "contribuyente" stemmed to
    // "contribuyent" while its offsets still cover the original word.
    assert_eq!(tokens.len(), 3);
    assert_token(&tokens[0], 0, "hola", 0, 4);
    assert_token(&tokens[1], 1, "feliz", 6, 11);
    assert_token(&tokens[2], 2, "contribuyent", 12, 25);
}
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
|
||||
@@ -4,22 +4,77 @@ use super::{Token, TokenFilter, TokenStream};
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// `Stemmer` token filter. Currently only English is supported.
|
||||
/// Tokens are expected to be lowercased beforehands.
|
||||
/// Available stemmer languages.
|
||||
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
|
||||
#[allow(missing_docs)]
|
||||
pub enum Language {
|
||||
Arabic,
|
||||
Danish,
|
||||
Dutch,
|
||||
English,
|
||||
Finnish,
|
||||
French,
|
||||
German,
|
||||
Hungarian,
|
||||
Italian,
|
||||
Portuguese,
|
||||
Romanian,
|
||||
Russian,
|
||||
Spanish,
|
||||
Swedish,
|
||||
Tamil,
|
||||
Turkish
|
||||
}
|
||||
|
||||
impl Language {
|
||||
fn algorithm(&self) -> Algorithm {
|
||||
use self::Language::*;
|
||||
|
||||
match self {
|
||||
Arabic => Algorithm::Arabic,
|
||||
Danish => Algorithm::Danish,
|
||||
Dutch => Algorithm::Dutch,
|
||||
English => Algorithm::English,
|
||||
Finnish => Algorithm::Finnish,
|
||||
French => Algorithm::French,
|
||||
German => Algorithm::German,
|
||||
Hungarian => Algorithm::Hungarian,
|
||||
Italian => Algorithm::Italian,
|
||||
Portuguese => Algorithm::Portuguese,
|
||||
Romanian => Algorithm::Romanian,
|
||||
Russian => Algorithm::Russian,
|
||||
Spanish => Algorithm::Spanish,
|
||||
Swedish => Algorithm::Swedish,
|
||||
Tamil => Algorithm::Tamil,
|
||||
Turkish => Algorithm::Turkish
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `Stemmer` token filter. Several languages are supported, see `Language` for the available
|
||||
/// languages.
|
||||
/// Tokens are expected to be lowercased beforehand.
///
/// Build one with `Stemmer::new(language)`; `Stemmer::default()` is the English stemmer.
|
||||
#[derive(Clone)]
|
||||
pub struct Stemmer {
|
||||
// Algorithm chosen from the `Language` passed at construction, held behind an `Arc`.
stemmer_algorithm: Arc<Algorithm>,
|
||||
}
|
||||
|
||||
impl Stemmer {
|
||||
/// Creates a new Stemmer `TokenFilter`.
|
||||
pub fn new() -> Stemmer {
|
||||
/// Creates a new Stemmer `TokenFilter` for a given language algorithm.
|
||||
pub fn new(language: Language) -> Stemmer {
|
||||
Stemmer {
|
||||
stemmer_algorithm: Arc::new(Algorithm::English),
|
||||
stemmer_algorithm: Arc::new(language.algorithm()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Stemmer {
|
||||
/// Creates a new Stemmer `TokenFilter` for English.
|
||||
fn default() -> Self {
|
||||
Stemmer::new(Language::English)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
|
||||
@@ -64,7 +64,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
|
||||
/// let en_stem = SimpleTokenizer
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser)
|
||||
/// .filter(Stemmer::new());
|
||||
/// .filter(Stemmer::default());
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
|
||||
@@ -8,6 +8,7 @@ use tokenizer::RemoveLongFilter;
|
||||
use tokenizer::SimpleTokenizer;
|
||||
use tokenizer::Stemmer;
|
||||
use tokenizer::Tokenizer;
|
||||
use tokenizer::stemmer::Language;
|
||||
|
||||
/// The tokenizer manager serves as a store for
|
||||
/// all of the pre-configured tokenizer pipelines.
|
||||
@@ -71,7 +72,7 @@ impl Default for TokenizerManager {
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new()),
|
||||
.filter(Stemmer::new(Language::English)),
|
||||
);
|
||||
manager
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user