From e75bb1d6a13f22d559072b682f9fcfac0e1ef965 Mon Sep 17 00:00:00 2001
From: Dru Sellers
Date: Tue, 30 Oct 2018 18:35:27 -0500
Subject: [PATCH] Fix NGram processing of non-ascii characters (#430)

* A working version

* optimize the ngram parsing

* Decoding codepoint only once.

* Closes #429

* using leading_zeros to make code less cryptic

* lookup in a table
---
 src/tokenizer/mod.rs             | 113 ++------
 src/tokenizer/ngram_tokenizer.rs | 460 ++++++++++++++++++++++++++-----
 2 files changed, 411 insertions(+), 162 deletions(-)

diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index dd8eb18dd..de5ec9f00 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -157,35 +157,34 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
-/// This is a function that can be used in tests and doc tests
-/// to assert a token's correctness.
-/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
-/// public api?
-pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
-    assert_eq!(
-        token.position, position,
-        "expected position {} but {:?}",
-        position, token
-    );
-    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
-    assert_eq!(
-        token.offset_from, from,
-        "expected offset_from {} but {:?}",
-        from, token
-    );
-    assert_eq!(
-        token.offset_to, to,
-        "expected offset_to {} but {:?}",
-        to, token
-    );
-}
 
 #[cfg(test)]
-pub mod test {
-    use super::assert_token;
+pub mod tests {
     use super::Token;
     use super::TokenizerManager;
 
+
+    /// This is a function that can be used in tests and doc tests
+    /// to assert a token's correctness.
+    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+        assert_eq!(
+            token.position, position,
+            "expected position {} but {:?}",
+            position, token
+        );
+        assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
+        assert_eq!(
+            token.offset_from, from,
+            "expected offset_from {} but {:?}",
+            from, token
+        );
+        assert_eq!(
+            token.offset_to, to,
+            "expected offset_to {} but {:?}",
+            to, token
+        );
+    }
+
     #[test]
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
@@ -224,72 +223,6 @@ pub mod test {
         assert_token(&tokens[3], 3, "payer", 17, 22);
     }
 
-    #[test]
-    fn test_ngram_tokenizer() {
-        use super::{LowerCaser, NgramTokenizer};
-        use tokenizer::tokenizer::TokenStream;
-        use tokenizer::tokenizer::Tokenizer;
-
-        let tokenizer_manager = TokenizerManager::default();
-        tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
-        tokenizer_manager.register(
-            "ngram3",
-            NgramTokenizer::new(3, 3, false).filter(LowerCaser),
-        );
-        tokenizer_manager.register(
-            "edgegram5",
-            NgramTokenizer::new(2, 5, true).filter(LowerCaser),
-        );
-
-        let tokenizer = NgramTokenizer::new(1, 2, false);
-        let mut tokens: Vec<Token> = vec![];
-        {
-            let mut add_token = |token: &Token| {
-                tokens.push(token.clone());
-            };
-            tokenizer.token_stream("hello").process(&mut add_token);
-        }
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "he", 0, 2);
-        assert_token(&tokens[2], 1, "e", 1, 2);
-        assert_token(&tokens[3], 1, "el", 1, 3);
-        assert_token(&tokens[4], 2, "l", 2, 3);
-        assert_token(&tokens[5], 2, "ll", 2, 4);
-        assert_token(&tokens[6], 3, "l", 3, 4);
-        assert_token(&tokens[7], 3, "lo", 3, 5);
-        assert_token(&tokens[8], 4, "o", 4, 5);
-
-        let tokenizer = tokenizer_manager.get("ngram3").unwrap();
-        let mut tokens: Vec<Token> = vec![];
-        {
-            let mut add_token = |token: &Token| {
-                tokens.push(token.clone());
-            };
-            tokenizer.token_stream("Hello").process(&mut add_token);
-        }
-        assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hel", 0, 3);
-        assert_token(&tokens[1], 1, "ell", 1, 4);
-        assert_token(&tokens[2], 2, "llo", 2, 5);
-
-        let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
-        let mut tokens: Vec<Token> = vec![];
-        {
-            let mut add_token = |token: &Token| {
-                tokens.push(token.clone());
-            };
-            tokenizer
-                .token_stream("Frankenstein")
-                .process(&mut add_token);
-        }
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "fr", 0, 2);
-        assert_token(&tokens[1], 0, "fra", 0, 3);
-        assert_token(&tokens[2], 0, "fran", 0, 4);
-        assert_token(&tokens[3], 0, "frank", 0, 5);
-    }
-
     #[test]
     fn test_tokenizer_empty() {
         let tokenizer_manager = TokenizerManager::default();
diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs
index 6d615f848..ebd1ece98 100644
--- a/src/tokenizer/ngram_tokenizer.rs
+++ b/src/tokenizer/ngram_tokenizer.rs
@@ -2,14 +2,15 @@ use super::{Token, TokenStream, Tokenizer};
 
 /// Tokenize the text by splitting words into n-grams of the given size(s)
 ///
-/// With this tokenizer, the `position` field expresses the starting offset of the ngram
-/// rather than the `token` offset.
+/// With this tokenizer, the `position` is always 0.
+/// Beware however: in the presence of multiple values for the same field,
+/// the position will be `POSITION_GAP * index of value`.
 ///
 /// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
 ///
 /// | Term     | he  | hel | el  | ell | ll  | llo | lo |
 /// |----------|-----|-----|-----|-----|-----|-----|----|
-/// | Position | 0   | 0   | 1   | 1   | 2   | 2   | 3  |
+/// | Position | 0   | 0   | 0   | 0   | 0   | 0   | 0  |
 /// | Offsets  | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
 ///
 /// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
@@ -19,24 +20,63 @@ use super::{Token, TokenStream, Tokenizer};
 /// | Position | 0   | 0   | 0   | 0   |
 /// | Offsets  | 0,2 | 0,3 | 0,4 | 0,5 |
 ///
+/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
+///
+/// | Term     | hε  | hεl | hεll | hεllo |
+/// |----------|-----|-----|------|-------|
+/// | Position | 0   | 0   | 0    | 0     |
+/// | Offsets  | 0,3 | 0,4 | 0,5  | 0,6   |
+///
 /// # Example
 ///
 /// ```
-/// extern crate tantivy;
+/// # extern crate tantivy;
 /// use tantivy::tokenizer::*;
-/// use tantivy::tokenizer::assert_token;
-///
 /// # fn main() {
 /// let tokenizer = NgramTokenizer::new(2, 3, false);
 /// let mut stream = tokenizer.token_stream("hello");
-///
-/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
-/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
-/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
-/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
-/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
-/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
-/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "he");
+///     assert_eq!(token.offset_from, 0);
+///     assert_eq!(token.offset_to, 2);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "hel");
+///     assert_eq!(token.offset_from, 0);
+///     assert_eq!(token.offset_to, 3);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "el");
+///     assert_eq!(token.offset_from, 1);
+///     assert_eq!(token.offset_to, 3);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "ell");
+///     assert_eq!(token.offset_from, 1);
+///     assert_eq!(token.offset_to, 4);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "ll");
+///     assert_eq!(token.offset_from, 2);
+///     assert_eq!(token.offset_to, 4);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "llo");
+///     assert_eq!(token.offset_from, 2);
+///     assert_eq!(token.offset_to, 5);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "lo");
+///     assert_eq!(token.offset_from, 3);
+///     assert_eq!(token.offset_to, 5);
+/// }
 /// assert!(stream.next().is_none());
 /// # }
 /// ```
@@ -58,23 +98,37 @@ impl NgramTokenizer {
             min_gram <= max_gram,
             "min_gram must not be greater than max_gram"
         );
-
         NgramTokenizer {
             min_gram,
             max_gram,
             prefix_only,
         }
     }
+
+    /// Create a `NgramTokenizer` which generates tokens for all inner ngrams.
+    ///
+    /// This is as opposed to only prefix ngrams.
+    pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+        Self::new(min_gram, max_gram, false)
+    }
+
+    /// Create a `NgramTokenizer` which only generates tokens for the
+    /// prefix ngrams.
+    pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+        Self::new(min_gram, max_gram, true)
+    }
 }
+
+/// TokenStream associated with the `NgramTokenizer`
 pub struct NgramTokenStream<'a> {
-    text: &'a str,
-    position: usize,
-    text_length: usize,
-    token: Token,
-    min_gram: usize,
-    max_gram: usize,
-    gram_size: usize,
+    /// parameters
+    ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
+    /// true if the NgramTokenStream is in prefix mode.
     prefix_only: bool,
+    /// input
+    text: &'a str,
+    /// output
+    token: Token,
 }
 
 impl<'a> Tokenizer<'a> for NgramTokenizer {
@@ -82,65 +136,28 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {
 
     fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
         NgramTokenStream {
-            text,
-            position: 0,
-            text_length: text.len(),
-            token: Token::default(),
-            min_gram: self.min_gram,
-            max_gram: self.max_gram,
+            ngram_charidx_iterator: StutteringIterator::new(
+                CodepointFrontiers::for_str(text),
+                self.min_gram,
+                self.max_gram),
             prefix_only: self.prefix_only,
-            gram_size: self.min_gram,
+            text,
+            token: Token::default(),
         }
     }
 }
 
-impl<'a> NgramTokenStream<'a> {
-    /// Get the next set of token options
-    /// cycle through 1,2 (min..=max)
-    /// returning None if processing should stop
-    fn chomp(&mut self) -> Option<(usize, usize)> {
-        // Have we exceeded the bounds of the text we are indexing?
-        if self.gram_size > self.max_gram {
-            if self.prefix_only {
-                return None;
-            }
-
-            // since we aren't just processing edges
-            // we need to reset the gram size
-            self.gram_size = self.min_gram;
-
-            // and move down the chain of letters
-            self.position += 1;
-        }
-
-        let result = if (self.position + self.gram_size) <= self.text_length {
-            Some((self.position, self.gram_size))
-        } else {
-            None
-        };
-
-        // increase the gram size for the next pass
-        self.gram_size += 1;
-
-        result
-    }
-}
-
 impl<'a> TokenStream for NgramTokenStream<'a> {
     fn advance(&mut self) -> bool {
-        // clear out working token text
-        self.token.text.clear();
-
-        if let Some((position, size)) = self.chomp() {
-            self.token.position = position;
-            let offset_from = position;
-            let offset_to = offset_from + size;
-
+        if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
+            if self.prefix_only && offset_from > 0 {
+                return false;
+            }
+            self.token.position = 0;
             self.token.offset_from = offset_from;
             self.token.offset_to = offset_to;
-
+            self.token.text.clear();
             self.token.text.push_str(&self.text[offset_from..offset_to]);
-
             true
         } else {
             false
@@ -150,8 +167,307 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
     fn token(&self) -> &Token {
         &self.token
     }
-
     fn token_mut(&mut self) -> &mut Token {
         &mut self.token
     }
 }
+
+
+/// This iterator takes an underlying Iterator
+/// and emits all of the pairs `(a, b)` such that
+/// `a` and `b` are items emitted by the iterator at
+/// an interval between `min_gram` and `max_gram`.
+///
+/// The pairs are emitted in order of appearance:
+/// `a` first, then `b`.
+///
+/// See `test_stutterring_iterator` for an example of its
+/// output.
+struct StutteringIterator<T> {
+    underlying: T,
+    min_gram: usize,
+    max_gram: usize,
+
+    memory: Vec<usize>,
+    cursor: usize,
+    gram_len: usize
+}
+
+impl<T> StutteringIterator<T>
+    where T: Iterator<Item = usize> {
+    pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
+        assert!(min_gram > 0);
+        let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
+        if memory.len() <= min_gram {
+            // returns an empty iterator
+            StutteringIterator {
+                underlying,
+                min_gram: 1,
+                max_gram: 0,
+                memory,
+                cursor: 0,
+                gram_len: 0,
+            }
+        } else {
+            StutteringIterator {
+                underlying,
+                min_gram,
+                max_gram: memory.len() - 1,
+                memory,
+                cursor: 0,
+                gram_len: min_gram,
+            }
+        }
+    }
+}
+
+impl<T> Iterator for StutteringIterator<T>
+    where T: Iterator<Item = usize> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<(usize, usize)> {
+        if self.gram_len > self.max_gram {
+            // we have exhausted all options
+            // starting at `self.memory[self.cursor]`.
+            //
+            // Time to advance.
+            self.gram_len = self.min_gram;
+            if let Some(next_val) = self.underlying.next() {
+                self.memory[self.cursor] = next_val;
+            } else {
+                self.max_gram -= 1;
+            }
+            self.cursor += 1;
+            if self.cursor >= self.memory.len() {
+                self.cursor = 0;
+            }
+        }
+        if self.max_gram < self.min_gram {
+            return None;
+        }
+        let start = self.memory[self.cursor % self.memory.len()];
+        let stop = self.memory[(self.cursor + self.gram_len) % self.memory.len()];
+        self.gram_len += 1;
+        Some((start, stop))
+    }
+}
+
+
+
+/// Emits all of the offsets where a codepoint starts
+/// or a codepoint ends.
+///
+/// By convention, we emit [0] for the empty string.
+struct CodepointFrontiers<'a> {
+    s: &'a str,
+    next_el: Option<usize>
+}
+
+impl<'a> CodepointFrontiers<'a> {
+    fn for_str(s: &'a str) -> Self {
+        CodepointFrontiers {
+            s,
+            next_el: Some(0)
+        }
+    }
+}
+
+impl<'a> Iterator for CodepointFrontiers<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<usize> {
+        self.next_el
+            .map(|offset| {
+                if self.s.is_empty() {
+                    self.next_el = None;
+                } else {
+                    let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
+                    self.s = &self.s[first_codepoint_width..];
+                    self.next_el = Some(offset + first_codepoint_width);
+                }
+                offset
+            })
+    }
+}
+
+const CODEPOINT_UTF8_WIDTH: [u8; 16] = [
+    1, 1, 1, 1,
+    1, 1, 1, 1,
+    2, 2, 2, 2,
+    2, 2, 3, 4,
+];
+
+// Number of bytes to encode a codepoint in UTF-8 given
+// the first byte.
+//
+// The width is given by the number of leading bits set to `1` in the
+// first byte; it is precomputed in a table indexed by the byte's
+// 4 most significant bits.
+fn utf8_codepoint_width(b: u8) -> usize {
+    let higher_4_bits = (b as usize) >> 4;
+    CODEPOINT_UTF8_WIDTH[higher_4_bits] as usize
+}
+
+#[cfg(test)]
+mod tests {
+
+    use tokenizer::tokenizer::{TokenStream, Tokenizer};
+    use super::NgramTokenizer;
+    use tokenizer::Token;
+    use tokenizer::tests::assert_token;
+    use super::CodepointFrontiers;
+    use super::StutteringIterator;
+    use super::utf8_codepoint_width;
+
+    fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
+        let mut tokens: Vec<Token> = vec![];
+        tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
+        tokens
+    }
+
+
+    #[test]
+    fn test_utf8_codepoint_width() {
+        // 0xxx
+        for i in 0..128 {
+            assert_eq!(utf8_codepoint_width(i), 1);
+        }
+        // 110xx
+        for i in (128 | 64)..(128 | 64 | 32) {
+            assert_eq!(utf8_codepoint_width(i), 2);
+        }
+        // 1110xx
+        for i in (128 | 64 | 32)..(128 | 64 | 32 | 16) {
+            assert_eq!(utf8_codepoint_width(i), 3);
+        }
+        // 1111xx
+        for i in (128 | 64 | 32 | 16)..256 {
+            assert_eq!(utf8_codepoint_width(i as u8), 4);
+        }
+    }
+
+
+    #[test]
+    fn test_codepoint_frontiers() {
+        assert_eq!(CodepointFrontiers::for_str("").collect::<Vec<usize>>(), vec![0]);
+        assert_eq!(
+            CodepointFrontiers::for_str("abcd").collect::<Vec<usize>>(),
+            vec![0, 1, 2, 3, 4]
+        );
+        assert_eq!(
+            CodepointFrontiers::for_str("aあ").collect::<Vec<usize>>(),
+            vec![0, 1, 4]
+        );
+    }
+
+    #[test]
+    fn test_ngram_tokenizer_1_2_false() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "he", 0, 2);
+        assert_token(&tokens[2], 0, "e", 1, 2);
+        assert_token(&tokens[3], 0, "el", 1, 3);
+        assert_token(&tokens[4], 0, "l", 2, 3);
+        assert_token(&tokens[5], 0, "ll", 2, 4);
+        assert_token(&tokens[6], 0, "l", 3, 4);
+        assert_token(&tokens[7], 0, "lo", 3, 5);
+        assert_token(&tokens[8], 0, "o", 4, 5);
+    }
+
+    #[test]
+    fn test_ngram_tokenizer_min_max_equal() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hel", 0, 3);
+        assert_token(&tokens[1], 0, "ell", 1, 4);
+        assert_token(&tokens[2], 0, "llo", 2, 5);
+    }
+
+    #[test]
+    fn test_ngram_tokenizer_2_5_prefix() {
+        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "fr", 0, 2);
+        assert_token(&tokens[1], 0, "fra", 0, 3);
+        assert_token(&tokens[2], 0, "fran", 0, 4);
+        assert_token(&tokens[3], 0, "frank", 0, 5);
+    }
+
+    #[test]
+    fn test_ngram_non_ascii_1_2() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "hε", 0, 3);
+        assert_token(&tokens[2], 0, "ε", 1, 3);
+        assert_token(&tokens[3], 0, "εl", 1, 4);
+        assert_token(&tokens[4], 0, "l", 3, 4);
+        assert_token(&tokens[5], 0, "ll", 3, 5);
+        assert_token(&tokens[6], 0, "l", 4, 5);
+        assert_token(&tokens[7], 0, "lo", 4, 6);
+        assert_token(&tokens[8], 0, "o", 5, 6);
+    }
+
+    #[test]
+    fn test_ngram_non_ascii_2_5_prefix() {
+        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "hε", 0, 3);
+        assert_token(&tokens[1], 0, "hεl", 0, 4);
+        assert_token(&tokens[2], 0, "hεll", 0, 5);
+        assert_token(&tokens[3], 0, "hεllo", 0, 6);
+    }
+
+    #[test]
+    fn test_ngram_empty() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
+        assert!(tokens.is_empty());
+        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
+        assert!(tokens.is_empty());
+    }
+
+
+    #[test]
+    #[should_panic(expected = "min_gram must be greater than 0")]
+    fn test_ngram_min_max_interval_empty() {
+        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
+    }
+
+    #[test]
+    #[should_panic(expected = "min_gram must not be greater than max_gram")]
+    fn test_invalid_interval_should_panic_if_smaller() {
+        NgramTokenizer::all_ngrams(2, 1);
+    }
+
+
+    #[test]
+    fn test_stutterring_iterator_empty() {
+        let rg: Vec<usize> = vec![0];
+        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
+        assert_eq!(it.next(), None);
+    }
+
+    #[test]
+    fn test_stutterring_iterator() {
+        let rg: Vec<usize> = (0..10).collect();
+        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
+        assert_eq!(it.next(), Some((0, 1)));
+        assert_eq!(it.next(), Some((0, 2)));
+        assert_eq!(it.next(), Some((1, 2)));
+        assert_eq!(it.next(), Some((1, 3)));
+        assert_eq!(it.next(), Some((2, 3)));
+        assert_eq!(it.next(), Some((2, 4)));
+        assert_eq!(it.next(), Some((3, 4)));
+        assert_eq!(it.next(), Some((3, 5)));
+        assert_eq!(it.next(), Some((4, 5)));
+        assert_eq!(it.next(), Some((4, 6)));
+        assert_eq!(it.next(), Some((5, 6)));
+        assert_eq!(it.next(), Some((5, 7)));
+        assert_eq!(it.next(), Some((6, 7)));
+        assert_eq!(it.next(), Some((6, 8)));
+        assert_eq!(it.next(), Some((7, 8)));
+        assert_eq!(it.next(), Some((7, 9)));
+        assert_eq!(it.next(), Some((8, 9)));
+        assert_eq!(it.next(), None);
+    }
+
+}
\ No newline at end of file
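
Illustrative sketch (not part of the patch, standalone and using only `std`): the key idea of the change is that ngram windows are taken between codepoint frontiers, i.e. byte offsets where a codepoint starts or ends, which is exactly what `CodepointFrontiers` feeds into `StutteringIterator`. Slicing `&text[from..to]` at such offsets can never split a multi-byte UTF-8 character, whereas slicing at arbitrary byte positions, as the removed byte-by-byte `chomp` logic could do on non-ascii input, panics on a non-char-boundary index. The helper name `ngram_byte_ranges` below is made up for illustration and is not an API introduced by this patch.

```rust
// Standalone sketch of the technique: build all (from, to) byte ranges for
// ngrams of min_gram..=max_gram codepoints, using codepoint frontiers so
// every range is a valid UTF-8 slice boundary.
fn ngram_byte_ranges(text: &str, min_gram: usize, max_gram: usize) -> Vec<(usize, usize)> {
    // Byte offsets where a codepoint starts, plus the end of the string.
    let mut frontiers: Vec<usize> = text.char_indices().map(|(offset, _)| offset).collect();
    frontiers.push(text.len());

    let mut ranges = Vec::new();
    for start in 0..frontiers.len().saturating_sub(min_gram) {
        for len in min_gram..=max_gram {
            if start + len >= frontiers.len() {
                break;
            }
            ranges.push((frontiers[start], frontiers[start + len]));
        }
    }
    ranges
}

fn main() {
    // "hε" spans bytes 0..3 because 'ε' is 2 bytes in UTF-8; slicing at a raw
    // byte index such as 2 would panic, slicing at frontiers never does.
    for (from, to) in ngram_byte_ranges("hεllo", 1, 2) {
        println!("{:>2}..{:<2} {}", from, to, &"hεllo"[from..to]);
    }
}
```

Run on `"hεllo"` with `min_gram = 1, max_gram = 2`, this sketch yields the same nine ranges that `test_ngram_non_ascii_1_2` asserts (`0..1 h`, `0..3 hε`, `1..3 ε`, ..., `5..6 o`).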