mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-26 20:19:57 +00:00
Allow stemmers in languages other than English (#478)
Allow users to create stemmers for languages other than English. Add a default stemmer for English. Closes #478
This commit is contained in:
@@ -73,7 +73,7 @@
|
||||
//! let en_stem = SimpleTokenizer
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser)
|
||||
//! .filter(Stemmer::new());
|
||||
//! .filter(Stemmer::new(Language::English));
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
@@ -148,7 +148,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
|
||||
pub use self::raw_tokenizer::RawTokenizer;
|
||||
pub use self::remove_long::RemoveLongFilter;
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::Stemmer;
|
||||
pub use self::stemmer::{Stemmer, Language};
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub(crate) use self::tokenizer::box_tokenizer;
|
||||
@@ -159,8 +159,16 @@ pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use super::Token;
|
||||
use super::TokenizerManager;
|
||||
use super::{
|
||||
Token,
|
||||
TokenizerManager,
|
||||
SimpleTokenizer,
|
||||
Tokenizer,
|
||||
RemoveLongFilter,
|
||||
LowerCaser,
|
||||
Stemmer,
|
||||
Language
|
||||
};
|
||||
|
||||
/// This is a function that can be used in tests and doc tests
|
||||
/// to assert a token's correctness.
|
||||
@@ -214,6 +222,7 @@ pub mod tests {
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_token(&tokens[0], 0, "hello", 0, 5);
|
||||
assert_token(&tokens[1], 1, "happi", 7, 12);
|
||||
@@ -221,6 +230,33 @@ pub mod tests {
|
||||
assert_token(&tokens[3], 3, "payer", 17, 22);
|
||||
}
|
||||
|
||||
/// Checks that a stemmer built for a language other than English can be
/// registered under a custom name and used through the `TokenizerManager`.
///
/// The pipeline is registered as `"es_stem"` and is fed a short Spanish
/// sentence; the emitted tokens are compared against the expected
/// position, text and byte offsets.
#[test]
fn test_non_en_tokenizer() {
    let tokenizer_manager = TokenizerManager::default();

    // Same filter chain as the English example, with a Spanish stemmer at the end.
    let spanish_pipeline = SimpleTokenizer
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::Spanish));
    tokenizer_manager.register("es_stem", spanish_pipeline);

    let es_tokenizer = tokenizer_manager.get("es_stem").unwrap();

    let mut collected: Vec<Token> = Vec::new();
    {
        // The closure mutably borrows `collected`; the inner scope ends that
        // borrow before the assertions below read the vector.
        let mut collect_token = |tok: &Token| collected.push(tok.clone());
        es_tokenizer
            .token_stream("Hola, feliz contribuyente!")
            .process(&mut collect_token);
    }

    // (position, text, offset_from, offset_to) for each expected token.
    let expected = [
        (0, "hola", 0, 4),
        (1, "feliz", 6, 11),
        (2, "contribuyent", 12, 25),
    ];

    assert_eq!(collected.len(), 3);
    for (tok, &(position, text, from, to)) in collected.iter().zip(expected.iter()) {
        assert_token(tok, position, text, from, to);
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
|
||||
Reference in New Issue
Block a user