Compare commits


2 Commits
float ... 0.9.1

Author                  SHA1          Message                               Date
Paul Masurel            2b28e491c2    Preparing for hotfix release 0.9.1    2019-03-28 09:58:33 +09:00
Panagiotis Ktistakis    1d4fa4547c    Fix non english stemmers (#521)       2019-03-28 09:50:27 +09:00
4 changed files with 16 additions and 12 deletions

CHANGELOG.md

@@ -1,3 +1,8 @@
+Tantivy 0.9.1
+=====================
+
+Hotfix: The english stemmer was actually used for all languages.
+
 Tantivy 0.9.0
 =====================
 *0.9.0 index format is not compatible with the

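The hotfix line above is the entire substance of #521: the Stemmer token filter accepted a Language, but then built its rust_stemmers stemmer with a hardcoded Algorithm::English (see the last file below). A minimal sketch of the observable difference, using the rust_stemmers crate directly; the Greek stem is the one the updated test below expects:

use rust_stemmers::{Algorithm, Stemmer};

fn main() {
    // What every tantivy Stemmer filter effectively used before the fix,
    // regardless of the configured Language:
    let english = Stemmer::create(Algorithm::English);
    // What Stemmer::new(Language::Greek) is supposed to select:
    let greek = Stemmer::create(Algorithm::Greek);

    // The English algorithm has no rules for Greek suffixes, so the word
    // should come back unchanged; the Greek algorithm reduces it to the
    // stem the updated test expects.
    println!("{}", english.stem("καλημέρα")); // καλημέρα (unchanged)
    println!("{}", greek.stem("καλημέρα"));   // καλημερ
}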
Cargo.toml

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.9.0" version = "0.9.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]

src/tokenizer/mod.rs

@@ -228,27 +228,27 @@ pub mod tests {
     fn test_non_en_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
-            "es_stem",
+            "el_stem",
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::Spanish)),
+                .filter(Stemmer::new(Language::Greek)),
         );
-        let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
+        let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
                 tokens.push(token.clone());
             };
             en_tokenizer
-                .token_stream("Hola, feliz contribuyente!")
+                .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hola", 0, 4);
-        assert_token(&tokens[1], 1, "feliz", 6, 11);
-        assert_token(&tokens[2], 2, "contribuyent", 12, 25);
+        assert_token(&tokens[0], 0, "καλημερ", 0, 16);
+        assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
+        assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
     }
 
     #[test]

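The new expected offsets are larger than the token lengths suggest because Token offsets are byte offsets into the original text, and Greek letters take two bytes each in UTF-8. A quick standalone check of the numbers in the new assertions:

fn main() {
    let text = "Καλημέρα, χαρούμενε φορολογούμενε!";
    assert_eq!(&text[0..16], "Καλημέρα");       // 8 letters x 2 bytes each
    assert_eq!(&text[18..36], "χαρούμενε");     // 9 letters x 2 bytes
    assert_eq!(&text[37..63], "φορολογούμενε"); // 13 letters x 2 bytes
}

The switch from Spanish to Greek is also what makes the test meaningful: the English algorithm plausibly stems "contribuyente" to "contribuyent" as well (final-e deletion), so the old Spanish expectations could pass even with the bug present, while Greek words are left untouched by the English rules.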
src/tokenizer/stemmer.rs

@@ -2,7 +2,6 @@
 use super::{Token, TokenFilter, TokenStream};
 use rust_stemmers::{self, Algorithm};
-use std::sync::Arc;
 
 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -57,14 +56,14 @@ impl Language {
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
-    stemmer_algorithm: Arc<Algorithm>,
+    stemmer_algorithm: Algorithm,
 }
 
 impl Stemmer {
     /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
     pub fn new(language: Language) -> Stemmer {
         Stemmer {
-            stemmer_algorithm: Arc::new(language.algorithm()),
+            stemmer_algorithm: language.algorithm(),
         }
     }
 }
@@ -83,7 +82,7 @@ where
     type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
 
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
+        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream::wrap(inner_stemmer, token_stream)
     }
 }
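
Two independent changes land in this file. The functional fix is the last hunk: transform now creates the rust_stemmers::Stemmer from the stored stemmer_algorithm instead of the hardcoded Algorithm::English. The Arc removal is an unrelated cleanup: as the by-value call in the fixed line shows, rust_stemmers::Algorithm is a plain Copy enum, so wrapping it in an Arc bought nothing. A reduced model of the patched filter (the stripped-down struct and the make_inner_stemmer name are illustrative, not tantivy's API):

use rust_stemmers::Algorithm;

#[derive(Clone)]
struct Stemmer {
    stemmer_algorithm: Algorithm, // was Arc<Algorithm>; Algorithm is Copy
}

impl Stemmer {
    fn make_inner_stemmer(&self) -> rust_stemmers::Stemmer {
        // The actual one-line fix: use the configured algorithm rather
        // than always Algorithm::English.
        rust_stemmers::Stemmer::create(self.stemmer_algorithm)
    }
}

fn main() {
    let greek = Stemmer { stemmer_algorithm: Algorithm::Greek };
    // Matches the updated test expectation for the lowercased token.
    assert_eq!(greek.make_inner_stemmer().stem("καλημέρα"), "καλημερ");
}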