mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 00:02:55 +00:00
Fix NGram processing of non-ascii characters (#430)
* A working version
* optimize the ngram parsing
* Decoding codepoint only once.
* Closes #429
* using leading_zeros to make code less cryptic
* lookup in a table
This commit is contained in:
committed by
Paul Masurel
parent
63b9d62237
commit
e75bb1d6a1
@@ -157,35 +157,34 @@ pub use self::tokenizer::BoxedTokenizer;
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
/// This is a function that can be used in tests and doc tests
|
||||
/// to assert a token's correctness.
|
||||
/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
|
||||
/// public api?
|
||||
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
|
||||
assert_eq!(
|
||||
token.position, position,
|
||||
"expected position {} but {:?}",
|
||||
position, token
|
||||
);
|
||||
assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
|
||||
assert_eq!(
|
||||
token.offset_from, from,
|
||||
"expected offset_from {} but {:?}",
|
||||
from, token
|
||||
);
|
||||
assert_eq!(
|
||||
token.offset_to, to,
|
||||
"expected offset_to {} but {:?}",
|
||||
to, token
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
use super::assert_token;
|
||||
pub mod tests {
|
||||
use super::Token;
|
||||
use super::TokenizerManager;
|
||||
|
||||
|
||||
/// This is a function that can be used in tests and doc tests
|
||||
/// to assert a token's correctness.
|
||||
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
|
||||
assert_eq!(
|
||||
token.position, position,
|
||||
"expected position {} but {:?}",
|
||||
position, token
|
||||
);
|
||||
assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
|
||||
assert_eq!(
|
||||
token.offset_from, from,
|
||||
"expected offset_from {} but {:?}",
|
||||
from, token
|
||||
);
|
||||
assert_eq!(
|
||||
token.offset_to, to,
|
||||
"expected offset_to {} but {:?}",
|
||||
to, token
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
@@ -224,72 +223,6 @@ pub mod test {
|
||||
assert_token(&tokens[3], 3, "payer", 17, 22);
|
||||
}
|
||||
|
||||
#[test]
fn test_ngram_tokenizer() {
    use super::{LowerCaser, NgramTokenizer};
    use tokenizer::tokenizer::TokenStream;
    use tokenizer::tokenizer::Tokenizer;

    // Register three n-gram configurations. "ngram3" and "edgegram5" are
    // chained with `LowerCaser` and are looked up by name further down.
    let tokenizer_manager = TokenizerManager::default();
    tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
    tokenizer_manager.register(
        "ngram3",
        NgramTokenizer::new(3, 3, false).filter(LowerCaser),
    );
    tokenizer_manager.register(
        "edgegram5",
        NgramTokenizer::new(2, 5, true).filter(LowerCaser),
    );

    // Case 1: a directly constructed (1, 2) tokenizer over "hello".
    let ngram12 = NgramTokenizer::new(1, 2, false);
    let mut ngram12_tokens: Vec<Token> = vec![];
    {
        // The closure borrows the vector mutably; the inner scope ends that
        // borrow before the assertions read the vector.
        let mut collect = |token: &Token| {
            ngram12_tokens.push(token.clone());
        };
        ngram12.token_stream("hello").process(&mut collect);
    }
    // Each row is (position, text, offset_from, offset_to).
    let ngram12_expected = [
        (0, "h", 0, 1),
        (0, "he", 0, 2),
        (1, "e", 1, 2),
        (1, "el", 1, 3),
        (2, "l", 2, 3),
        (2, "ll", 2, 4),
        (3, "l", 3, 4),
        (3, "lo", 3, 5),
        (4, "o", 4, 5),
    ];
    assert_eq!(ngram12_tokens.len(), ngram12_expected.len());
    for (token, &(position, text, from, to)) in
        ngram12_tokens.iter().zip(ngram12_expected.iter())
    {
        assert_token(token, position, text, from, to);
    }

    // Case 2: the registered "ngram3" tokenizer over "Hello".
    let ngram3 = tokenizer_manager.get("ngram3").unwrap();
    let mut ngram3_tokens: Vec<Token> = vec![];
    {
        let mut collect = |token: &Token| {
            ngram3_tokens.push(token.clone());
        };
        ngram3.token_stream("Hello").process(&mut collect);
    }
    let ngram3_expected = [(0, "hel", 0, 3), (1, "ell", 1, 4), (2, "llo", 2, 5)];
    assert_eq!(ngram3_tokens.len(), ngram3_expected.len());
    for (token, &(position, text, from, to)) in
        ngram3_tokens.iter().zip(ngram3_expected.iter())
    {
        assert_token(token, position, text, from, to);
    }

    // Case 3: the registered "edgegram5" tokenizer over "Frankenstein".
    let edgegram5 = tokenizer_manager.get("edgegram5").unwrap();
    let mut edgegram5_tokens: Vec<Token> = vec![];
    {
        let mut collect = |token: &Token| {
            edgegram5_tokens.push(token.clone());
        };
        edgegram5
            .token_stream("Frankenstein")
            .process(&mut collect);
    }
    let edgegram5_expected = [
        (0, "fr", 0, 2),
        (0, "fra", 0, 3),
        (0, "fran", 0, 4),
        (0, "frank", 0, 5),
    ];
    assert_eq!(edgegram5_tokens.len(), edgegram5_expected.len());
    for (token, &(position, text, from, to)) in
        edgegram5_tokens.iter().zip(edgegram5_expected.iter())
    {
        assert_token(token, position, text, from, to);
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
|
||||
Reference in New Issue
Block a user