diff --git a/Cargo.toml b/Cargo.toml
index f830a57c9..7f285446e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
+aho-corasick = "0.7"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.5.3", optional = true }
 lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 145ba04fc..3b511d2a6 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -126,6 +126,7 @@ mod ngram_tokenizer;
 mod raw_tokenizer;
 mod remove_long;
 mod simple_tokenizer;
+mod split_compound_words;
 mod stemmer;
 mod stop_word_filter;
 mod tokenized_string;
@@ -141,6 +142,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
 pub use self::raw_tokenizer::RawTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::simple_tokenizer::SimpleTokenizer;
+pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs
new file mode 100644
index 000000000..b6063e83d
--- /dev/null
+++ b/src/tokenizer/split_compound_words.rs
@@ -0,0 +1,252 @@
+use std::sync::Arc;
+
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
+
+use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
+
+/// A [`TokenFilter`] which splits compound words into their parts
+/// based on a given dictionary.
+///
+/// Words will only be split if they can be fully decomposed into
+/// consecutive matches against the given dictionary.
+///
+/// This is mostly useful to split [compound nouns][compound] common to many
+/// Germanic languages into their constituents.
+///
+/// # Example
+///
+/// The quality of the dictionary determines the quality of the splits,
+/// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
+/// is not split in the following example.
+///
+/// ```rust
+/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
+///
+/// let tokenizer =
+///     TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
+///         "dampf", "schiff", "fahrt", "brot", "backen", "automat",
+///     ]));
+///
+/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
+/// assert_eq!(stream.next().unwrap().text, "dampf");
+/// assert_eq!(stream.next().unwrap().text, "schiff");
+/// assert_eq!(stream.next().unwrap().text, "fahrt");
+/// assert_eq!(stream.next(), None);
+///
+/// let mut stream = tokenizer.token_stream("brotbackautomat");
+/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
+/// assert_eq!(stream.next(), None);
+/// ```
+///
+/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
+#[derive(Clone)]
+pub struct SplitCompoundWords<S: StateID> {
+    dict: Arc<AhoCorasick<S>>,
+}
+
+impl SplitCompoundWords<usize> {
+    /// Create a filter from a given dictionary.
+    ///
+    /// The dictionary will be used to construct an [`AhoCorasick`] automaton
+    /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
+    /// more control over its construction is required.
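+    ///
+    /// The defaults imply leftmost-longest matching, i.e. [`MatchKind::LeftmostLongest`],
+    /// so that the longest of several overlapping dictionary entries wins.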
+    pub fn from_dictionary<I, P>(dict: I) -> Self
+    where
+        I: IntoIterator<Item = P>,
+        P: AsRef<[u8]>,
+    {
+        let dict = AhoCorasickBuilder::new()
+            .match_kind(MatchKind::LeftmostLongest)
+            .build(dict);
+
+        Self::from_automaton(dict)
+    }
+}
+
+impl<S: StateID> SplitCompoundWords<S> {
+    /// Create a filter from a given automaton.
+    ///
+    /// The automaton should use one of the leftmost match kinds
+    /// and it should not be anchored.
+    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
+        Self {
+            dict: Arc::new(dict),
+        }
+    }
+}
+
+impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
+    fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(SplitCompoundWordsTokenStream {
+            dict: self.dict.clone(),
+            tail: stream,
+            cuts: Vec::new(),
+            parts: Vec::new(),
+        })
+    }
+}
+
+struct SplitCompoundWordsTokenStream<'a, S: StateID> {
+    dict: Arc<AhoCorasick<S>>,
+    tail: BoxTokenStream<'a>,
+    cuts: Vec<usize>,
+    parts: Vec<Token>,
+}
+
+impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
+    // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
+    // can be fully split into consecutive matches against `self.dict`.
+    fn split(&mut self) {
+        let token = self.tail.token();
+        let mut text = token.text.as_str();
+
+        self.cuts.clear();
+        let mut pos = 0;
+
+        for match_ in self.dict.find_iter(text) {
+            if pos != match_.start() {
+                break;
+            }
+
+            self.cuts.push(pos);
+            pos = match_.end();
+        }
+
+        if pos == token.text.len() {
+            // Fill `self.parts` in reverse order,
+            // so that `self.parts.pop()` yields
+            // the tokens in their original order.
+            for pos in self.cuts.iter().rev() {
+                let (head, tail) = text.split_at(*pos);
+
+                text = head;
+                self.parts.push(Token {
+                    text: tail.to_owned(),
+                    ..*token
+                });
+            }
+        }
+    }
+}
+
+impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
+    fn advance(&mut self) -> bool {
+        self.parts.pop();
+
+        if !self.parts.is_empty() {
+            return true;
+        }
+
+        if !self.tail.advance() {
+            return false;
+        }
+
+        // Will yield either `self.parts.last()` or
+        // `self.tail.token()` if it could not be split.
+        self.split();
+        true
+    }
+
+    fn token(&self) -> &Token {
+        self.parts.last().unwrap_or_else(|| self.tail.token())
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.parts
+            .last_mut()
+            .unwrap_or_else(|| self.tail.token_mut())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};
+
+    #[test]
+    fn splitting_compound_words_works() {
+        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));
+
+        {
+            let mut stream = tokenizer.token_stream("");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("foo bar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("foobar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("foobarbaz");
+            assert_eq!(stream.next().unwrap().text, "foobarbaz");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("baz foobar qux");
+            assert_eq!(stream.next().unwrap().text, "baz");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next().unwrap().text, "qux");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("foobar foobar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("foobar foo bar foobar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
+            assert_eq!(stream.next().unwrap().text, "foobazbar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("foobar qux foobar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next().unwrap().text, "qux");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next(), None);
+        }
+
+        {
+            let mut stream = tokenizer.token_stream("barfoo");
+            assert_eq!(stream.next().unwrap().text, "bar");
+            assert_eq!(stream.next().unwrap().text, "foo");
+            assert_eq!(stream.next(), None);
+        }
+    }
+}
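Beyond the doc-test above, here is a minimal sketch of pairing `from_automaton` with a hand-built automaton, assuming the `aho-corasick` 0.7 builder API added to Cargo.toml in this patch; the `ascii_case_insensitive` setting is only an illustration of the extra control `from_automaton` offers, not something the filter requires:

```rust
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};

// Hand-built automaton: keep one of the leftmost match kinds and leave it
// unanchored, as the docs of `from_automaton` require; the case-insensitive
// matching is the part `from_dictionary` would not give us.
let dict = AhoCorasickBuilder::new()
    .match_kind(MatchKind::LeftmostLongest)
    .ascii_case_insensitive(true)
    .build(["dampf", "schiff", "fahrt"]);

let tokenizer =
    TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_automaton(dict));

// The split parts keep the original casing, since they are cut out of the
// input token itself rather than copied from the dictionary entries.
let mut stream = tokenizer.token_stream("Dampfschifffahrt");
assert_eq!(stream.next().unwrap().text, "Dampf");
assert_eq!(stream.next().unwrap().text, "schiff");
assert_eq!(stream.next().unwrap().text, "fahrt");
assert_eq!(stream.next(), None);
```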