Simple Implementation of NGram Tokenizer (#278)

* Simple Implementation of NGram Tokenizer

It does not yet support edges
It could probably be better in many "rusty" ways
But the test is passing, so I'll call this a good stopping point for
the day.

* Remove Ngram from manager. Too many variations

* Basic configuration model

Should the extensive tests exist here?

* Add Sample to provide an End to End testing

* Basic Edgegram support

* cleanup

* code feedback

* More code review feedback processed
This commit is contained in:
Dru Sellers
2018-05-06 11:47:49 -05:00
committed by Paul Masurel
parent 68ee18e4e8
commit ca74c14647
5 changed files with 521 additions and 21 deletions

View File

@@ -132,6 +132,7 @@ mod alphanum_only;
mod facet_tokenizer;
mod japanese_tokenizer;
mod lower_caser;
mod ngram_tokenizer;
mod raw_tokenizer;
mod remove_long;
mod simple_tokenizer;
@@ -144,6 +145,7 @@ pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::facet_tokenizer::FacetTokenizer;
pub use self::japanese_tokenizer::JapaneseTokenizer;
pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
@@ -153,8 +155,32 @@ pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::tokenizer_manager::TokenizerManager;
/// Asserts that `token` carries the expected position, text and offsets.
///
/// Intended as a helper for tests and doc tests. The fields are checked in
/// the order `position`, `text`, `offset_from`, `offset_to`.
///
/// # Panics
///
/// Panics on the first field that differs from its expected value. The panic
/// message names the field, the expected value and dumps the whole token.
///
/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
/// public api?
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
    // Snapshot the numeric fields once; the token itself is still passed to
    // the messages so a failure shows every field.
    let (actual_position, actual_from, actual_to) =
        (token.position, token.offset_from, token.offset_to);

    assert_eq!(
        actual_position, position,
        "expected position {} but {:?}",
        position, token
    );
    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
    assert_eq!(
        actual_from, from,
        "expected offset_from {} but {:?}",
        from, token
    );
    assert_eq!(
        actual_to, to,
        "expected offset_to {} but {:?}",
        to, token
    );
}
#[cfg(test)]
mod test {
pub mod test {
use super::assert_token;
use super::Token;
use super::TokenizerManager;
@@ -162,17 +188,17 @@ mod test {
fn test_raw_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<String> = vec![];
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.text.clone());
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);
assert_eq!(&tokens[0], "Hello, happy tax payer!");
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
}
#[test]
@@ -180,20 +206,20 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<String> = vec![];
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.text.clone());
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_eq!(&tokens[0], "hello");
assert_eq!(&tokens[1], "happi");
assert_eq!(&tokens[2], "tax");
assert_eq!(&tokens[3], "payer");
assert_token(&tokens[0], 0, "hello", 0, 5);
assert_token(&tokens[1], 1, "happi", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer", 17, 22);
}
#[test]
@@ -201,21 +227,87 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("ja").unwrap();
let mut tokens: Vec<String> = vec![];
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.text.clone());
tokens.push(token.clone());
};
en_tokenizer
.token_stream("野菜食べないとやばい!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 5);
assert_eq!(&tokens[0], "野菜");
assert_eq!(&tokens[1], "食べ");
assert_eq!(&tokens[2], "ない");
assert_eq!(&tokens[3], "");
assert_eq!(&tokens[4], "やばい");
assert_token(&tokens[0], 0, "野菜", 0, 6);
assert_token(&tokens[1], 1, "食べ", 6, 12);
assert_token(&tokens[2], 2, "ない", 12, 18);
assert_token(&tokens[3], 3, "", 18, 21);
assert_token(&tokens[4], 4, "やばい", 21, 30);
}
// End-to-end check of the n-gram tokenizer in three configurations:
// plain (1, 2) grams, fixed-size (3, 3) grams with lower-casing, and
// prefix-only (2, 5) "edge" grams with lower-casing.
#[test]
fn test_ngram_tokenizer() {
use super::{LowerCaser, NgramTokenizer};
use tokenizer::tokenizer::TokenStream;
use tokenizer::tokenizer::Tokenizer;
let tokenizer_manager = TokenizerManager::default();
// "ngram12" is registered here, but the (1, 2) case below runs against a
// directly constructed tokenizer rather than one fetched from the manager.
tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
tokenizer_manager.register(
"ngram3",
NgramTokenizer::new(3, 3, false).filter(LowerCaser),
);
tokenizer_manager.register(
"edgegram5",
NgramTokenizer::new(2, 5, true).filter(LowerCaser),
);
// Case 1: grams of size 1 and 2 at every location of "hello".
let tokenizer = NgramTokenizer::new(1, 2, false);
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
tokenizer.token_stream("hello").process(&mut add_token);
}
// Arguments of assert_token: token, position, text, offset_from, offset_to.
// Both grams starting at the same location share that location as position.
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
assert_token(&tokens[2], 1, "e", 1, 2);
assert_token(&tokens[3], 1, "el", 1, 3);
assert_token(&tokens[4], 2, "l", 2, 3);
assert_token(&tokens[5], 2, "ll", 2, 4);
assert_token(&tokens[6], 3, "l", 3, 4);
assert_token(&tokens[7], 3, "lo", 3, 5);
assert_token(&tokens[8], 4, "o", 4, 5);
// Case 2: trigrams only, fetched through the manager; the LowerCaser
// filter turns "Hello" into lower-case grams.
let tokenizer = tokenizer_manager.get("ngram3").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
tokenizer.token_stream("Hello").process(&mut add_token);
}
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 1, "ell", 1, 4);
assert_token(&tokens[2], 2, "llo", 2, 5);
// Case 3: prefix-only mode emits grams anchored at the start of the text,
// growing from 2 up to 5 in length, so every token has position 0.
let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
tokenizer
.token_stream("Frankenstein")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
assert_token(&tokens[2], 0, "fran", 0, 4);
assert_token(&tokens[3], 0, "frank", 0, 5);
}
#[test]
@@ -223,20 +315,20 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<String> = vec![];
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.text.clone());
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
{
let mut tokens: Vec<String> = vec![];
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.text.clone());
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}

View File

@@ -0,0 +1,157 @@
use super::{Token, TokenStream, Tokenizer};
/// Tokenizes a text by splitting it into n-grams of the given size(s).
///
/// With this tokenizer, the `position` field of a token expresses the
/// starting offset of the n-gram within the text rather than an ordinal
/// token index: all grams that start at the same place share one position.
///
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
///
/// | Term     | he  | hel | el  | ell | ll  | llo | lo  |
/// |----------|-----|-----|-----|-----|-----|-----|-----|
/// | Position | 0   | 0   | 1   | 1   | 2   | 2   | 3   |
/// | Offsets  | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5 |
///
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
///
/// | Term     | he  | hel | hell | hello |
/// |----------|-----|-----|------|-------|
/// | Position | 0   | 0   | 0    | 0     |
/// | Offsets  | 0,2 | 0,3 | 0,4  | 0,5   |
///
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
/// use tantivy::tokenizer::assert_token;
///
/// # fn main() {
/// let tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
///
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
/// assert!(stream.next().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct NgramTokenizer {
    /// min size of the n-gram
    min_gram: usize,
    /// max size of the n-gram
    max_gram: usize,
    /// if true, will only parse the leading edge of the input
    prefix_only: bool,
}
impl NgramTokenizer {
    /// Builds an n-gram tokenizer producing grams whose size ranges from
    /// `min_gram` to `max_gram` (both inclusive).
    ///
    /// When `prefix_only` is true, only grams anchored at the start of the
    /// input are produced.
    ///
    /// # Panics
    ///
    /// Panics if `min_gram` is 0, or if `min_gram` is greater than `max_gram`.
    pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
        if min_gram == 0 {
            panic!("min_gram must be greater than 0");
        }
        if min_gram > max_gram {
            panic!("min_gram must not be greater than max_gram");
        }
        NgramTokenizer {
            prefix_only,
            max_gram,
            min_gram,
        }
    }
}
/// Token stream over the n-grams of a single text.
///
/// Created by the `Tokenizer::token_stream` implementation of
/// `NgramTokenizer`; it keeps a cursor (`location`, `gram_size`) into the
/// text and reuses one working `Token` for every gram it emits.
pub struct NgramTokenStream<'a> {
/// the text being tokenized
text: &'a str,
/// index into `text` at which the current gram starts; starts at 0
location: usize,
/// length of `text` as reported by `str::len` (i.e. in bytes)
text_length: usize,
/// working token, overwritten each time the stream advances
token: Token,
/// smallest gram size to emit
min_gram: usize,
/// largest gram size to emit
max_gram: usize,
/// size of the next gram to try; starts at `min_gram`
gram_size: usize,
/// if true, only grams starting at the initial location are emitted
prefix_only: bool,
}
impl<'a> Tokenizer<'a> for NgramTokenizer {
type TokenStreamImpl = NgramTokenStream<'a>;
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
NgramTokenStream {
text,
location: 0,
text_length: text.len(),
token: Token::default(),
min_gram: self.min_gram,
max_gram: self.max_gram,
prefix_only: self.prefix_only,
gram_size: self.min_gram,
}
}
}
impl<'a> NgramTokenStream<'a> {
    /// Returns `true` when a gram of the current size, starting at the
    /// current location, still lies within the text.
    fn fits(&self) -> bool {
        self.location + self.gram_size <= self.text_length
    }

    /// Yields the next `(location, gram_size)` pair to emit, or `None` once
    /// the stream is exhausted.
    ///
    /// At each location the gram size cycles through `min_gram..=max_gram`
    /// (e.g. 1, 2 for min 1 / max 2) before the location moves on by one.
    /// In `prefix_only` mode the location never moves, so the stream ends as
    /// soon as the sizes for the initial location are used up.
    ///
    /// Once `None` has been returned, every later call returns `None` too.
    fn chomp(&mut self) -> Option<(usize, usize)> {
        // Every gram size has been tried at this location.
        if self.gram_size > self.max_gram {
            if self.prefix_only {
                return None;
            }
            // since we aren't just processing edges
            // we need to reset the gram size
            self.gram_size = self.min_gram;
            // and move down the chain of letters
            self.location += 1;
        }

        if !self.fits() {
            // A larger gram cannot fit here either. In prefix mode that is
            // the end; likewise if even the smallest gram does not fit, no
            // later location has room for it.
            if self.prefix_only || self.gram_size == self.min_gram {
                return None;
            }
            // Only the larger sizes ran past the end of the text: shorter
            // grams may still fit at the next location, so keep going there
            // instead of ending the stream early.
            self.gram_size = self.min_gram;
            self.location += 1;
            if !self.fits() {
                return None;
            }
        }

        let result = (self.location, self.gram_size);
        // increase the gram size for the next pass
        self.gram_size += 1;
        Some(result)
    }
}
impl<'a> TokenStream for NgramTokenStream<'a> {
    /// Moves to the next n-gram and stores it in the working token.
    ///
    /// Returns `true` if a gram was produced, `false` once the stream is
    /// exhausted. On success the token's `position` and `offset_from` are
    /// the byte index at which the gram starts, `offset_to` is the byte
    /// index just past it, and `text` is the gram itself.
    ///
    /// Gram length is counted in characters and grams always start on a
    /// character boundary, so text containing multi-byte UTF-8 characters
    /// never causes a slicing panic. For pure ASCII text, byte and character
    /// counts coincide.
    fn advance(&mut self) -> bool {
        // clear out working token text
        self.token.text.clear();
        let text = self.text;
        while let Some((position, size)) = self.chomp() {
            // `position` counts bytes; skip starts that land in the middle
            // of a multi-byte character.
            if !text.is_char_boundary(position) {
                continue;
            }
            let remainder = &text[position..];
            // Byte length of the first `size` characters of the remainder,
            // or `None` when fewer than `size` characters are left.
            let gram_len = size.checked_sub(1).and_then(|last_char| {
                remainder
                    .char_indices()
                    .map(|(start, ch)| start + ch.len_utf8())
                    .nth(last_char)
            });
            if let Some(gram_len) = gram_len {
                let offset_from = position;
                let offset_to = offset_from + gram_len;
                self.token.position = position;
                self.token.offset_from = offset_from;
                self.token.offset_to = offset_to;
                self.token.text.push_str(&remainder[..gram_len]);
                return true;
            }
        }
        false
    }

    /// Returns the token produced by the latest successful `advance`.
    fn token(&self) -> &Token {
        &self.token
    }

    /// Returns the working token mutably.
    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

View File

@@ -4,6 +4,7 @@ use std::borrow::{Borrow, BorrowMut};
use tokenizer::TokenStreamChain;
/// Token
#[derive(Debug, Clone)]
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
@@ -260,3 +261,24 @@ pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
/// Wraps a token stream and returns the modified one.
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
}
#[cfg(test)]
mod test {
    use super::Token;

    /// A cloned token must carry the same value in every field as the token
    /// it was cloned from.
    #[test]
    fn clone() {
        let original = Token {
            text: String::from("abc"),
            offset_from: 2,
            offset_to: 3,
            position: 1,
        };
        let copy = original.clone();

        assert_eq!(original.position, copy.position);
        assert_eq!(original.offset_from, copy.offset_from);
        assert_eq!(original.offset_to, copy.offset_to);
        assert_eq!(original.text, copy.text);
    }
}