mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 00:02:55 +00:00
Compare commits
2 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 2b28e491c2 | |
| | 1d4fa4547c | |
```diff
@@ -1,3 +1,8 @@
+Tantivy 0.9.1
+=====================
+
+Hotfix: The english stemmer was actually used for all languages.
+
 Tantivy 0.9.0
 =====================
 *0.9.0 index format is not compatible with the
```
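The hotfix line above summarizes the bug fixed in the last hunk of this compare view: `Stemmer::new(language)` recorded the requested language, but `transform` built the underlying stemmer with a hardcoded `Algorithm::English`, so every configured language silently stemmed as English. A minimal sketch of the symptom, using the `rust_stemmers` crate directly (the expected Greek stem matches the assertion in the updated test further down):

```rust
use rust_stemmers::{Algorithm, Stemmer};

fn main() {
    // What the buggy 0.9.0 code effectively always used.
    let english = Stemmer::create(Algorithm::English);
    // What a `Stemmer::new(Language::Greek)` was supposed to use.
    let greek = Stemmer::create(Algorithm::Greek);

    // The English algorithm has no rules for Greek letters and leaves the
    // word untouched; the Greek algorithm strips the inflectional ending.
    println!("{}", english.stem("καλημέρα")); // prints "καλημέρα" (unchanged)
    println!("{}", greek.stem("καλημέρα")); // prints "καλημερ"
}
```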
```diff
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.9.0"
+version = "0.9.1"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
```
```diff
@@ -228,27 +228,27 @@ pub mod tests {
     fn test_non_en_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
-            "es_stem",
+            "el_stem",
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::Spanish)),
+                .filter(Stemmer::new(Language::Greek)),
         );
-        let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
+        let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
                 tokens.push(token.clone());
             };
             en_tokenizer
-                .token_stream("Hola, feliz contribuyente!")
+                .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
                 .process(&mut add_token);
         }
 
         assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hola", 0, 4);
-        assert_token(&tokens[1], 1, "feliz", 6, 11);
-        assert_token(&tokens[2], 2, "contribuyent", 12, 25);
+        assert_token(&tokens[0], 0, "καλημερ", 0, 16);
+        assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
+        assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
     }
 
     #[test]
```
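Two details of the updated test are easy to miss. First, switching the fixture language from Spanish to Greek is presumably what makes the test catch this bug at all: the old assertions passed even with the English stemmer in place (the English algorithm also happens to reduce "contribuyente" to "contribuyent"), while Greek words pass through the English algorithm unchanged. Second, the last two arguments of `assert_token` are byte offsets, not character offsets, which is why they jump from (0, 4) for "hola" to (0, 16) for "καλημερ". A quick sketch of the arithmetic:

```rust
fn main() {
    // The (0, 16) offsets asserted for the first Greek token are byte
    // offsets: "Καλημέρα" is 8 characters, each taking 2 bytes in UTF-8.
    let word = "Καλημέρα";
    assert_eq!(word.chars().count(), 8);
    assert_eq!(word.len(), 16); // str::len() counts bytes, not characters
}
```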
```diff
@@ -2,7 +2,6 @@
 
 use super::{Token, TokenFilter, TokenStream};
 use rust_stemmers::{self, Algorithm};
-use std::sync::Arc;
 
 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
```
```diff
@@ -57,14 +56,14 @@ impl Language {
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
-    stemmer_algorithm: Arc<Algorithm>,
+    stemmer_algorithm: Algorithm,
 }
 
 impl Stemmer {
     /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
     pub fn new(language: Language) -> Stemmer {
         Stemmer {
-            stemmer_algorithm: Arc::new(language.algorithm()),
+            stemmer_algorithm: language.algorithm(),
         }
     }
 }
```
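The two hunks above also drop an unneeded `Arc`: `Algorithm` is a fieldless enum that implements `Copy` (the fixed `transform` in the final hunk below passes `self.stemmer_algorithm` by value out of `&self`, which only compiles for `Copy` types), so the extra allocation and indirection bought nothing. A small illustration of why the plain field suffices:

```rust
use rust_stemmers::Algorithm;

fn main() {
    // `Algorithm` is Copy: assignment duplicates the value rather than
    // moving it, so no Arc or clone is needed to hand it around.
    let algorithm = Algorithm::Greek;
    let copied = algorithm;
    // Both bindings stay usable; with a non-Copy type, using `algorithm`
    // after the assignment above would fail to compile.
    let _ = (algorithm, copied);
}
```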
```diff
@@ -83,7 +82,7 @@ where
     type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
 
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
+        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream::wrap(inner_stemmer, token_stream)
     }
 }
```
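The `transform` hunk above is the actual one-line fix: the algorithm chosen at construction time was stored but never read, and `Algorithm::English` was used unconditionally. A hypothetical, self-contained re-creation of the before/after behavior (not tantivy code; `StemmerConfig` and its two methods are made up for illustration):

```rust
use rust_stemmers::{self, Algorithm};

// Hypothetical stand-in for tantivy's `Stemmer`, reduced to the one
// field that matters for this fix.
struct StemmerConfig {
    stemmer_algorithm: Algorithm,
}

impl StemmerConfig {
    // Old behavior: the stored algorithm was ignored.
    fn stem_before_fix(&self, word: &str) -> String {
        rust_stemmers::Stemmer::create(Algorithm::English)
            .stem(word)
            .into_owned()
    }

    // New behavior: the stored algorithm is actually used.
    fn stem_after_fix(&self, word: &str) -> String {
        rust_stemmers::Stemmer::create(self.stemmer_algorithm)
            .stem(word)
            .into_owned()
    }
}

fn main() {
    let greek = StemmerConfig {
        stemmer_algorithm: Algorithm::Greek,
    };
    // Same input, different algorithm actually applied: before the fix
    // the Greek word came back unchanged, after it comes back stemmed.
    assert_ne!(
        greek.stem_before_fix("φορολογούμενε"),
        greek.stem_after_fix("φορολογούμενε")
    );
}
```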