use std::sync::Arc;

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};

use super::{BoxTokenStream, Token, TokenFilter, TokenStream};

/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
///
/// Words will only be split if they can be fully decomposed into
/// consecutive matches against the given dictionary.
///
/// This is mostly useful to split [compound nouns][compound] common to many
/// Germanic languages into their constituents.
///
/// # Example
///
/// The quality of the dictionary determines the quality of the splits,
/// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
///     TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
///         "dampf", "schiff", "fahrt", "brot", "backen", "automat",
///     ]));
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
/// assert_eq!(stream.next().unwrap().text, "schiff");
/// assert_eq!(stream.next().unwrap().text, "fahrt");
/// assert_eq!(stream.next(), None);
///
/// let mut stream = tokenizer.token_stream("brotbackautomat");
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
/// assert_eq!(stream.next(), None);
/// ```
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords<S: StateID = usize> {
    dict: Arc<AhoCorasick<S>>,
}

impl SplitCompoundWords {
    /// Create a filter from a given dictionary.
    ///
    /// The dictionary will be used to construct an [`AhoCorasick`] automaton
    /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
    /// more control over its construction is required.
    pub fn from_dictionary<I, P>(dict: I) -> Self
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let dict = AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostLongest)
            .build(dict);

        Self::from_automaton(dict)
    }
}

impl<S: StateID> SplitCompoundWords<S> {
    /// Create a filter from a given automaton.
    ///
    /// The automaton should use one of the leftmost-first match kinds
    /// and it should not be anchored.
    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
        Self {
            dict: Arc::new(dict),
        }
    }
}

impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
    fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(SplitCompoundWordsTokenStream {
            dict: self.dict.clone(),
            tail: stream,
            cuts: Vec::new(),
            parts: Vec::new(),
        })
    }
}

struct SplitCompoundWordsTokenStream<'a, S: StateID> {
    dict: Arc<AhoCorasick<S>>,
    tail: BoxTokenStream<'a>,
    cuts: Vec<usize>,
    parts: Vec<Token>,
}

impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
    // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
    // can fully be split into consecutive matches against `self.dict`.
    fn split(&mut self) {
        let token = self.tail.token();
        let mut text = token.text.as_str();

        self.cuts.clear();
        let mut pos = 0;

        for match_ in self.dict.find_iter(text) {
            if pos != match_.start() {
                break;
            }

            self.cuts.push(pos);
            pos = match_.end();
        }

        if pos == token.text.len() {
            // Fill `self.parts` in reverse order,
            // so that `self.parts.pop()` yields
            // the tokens in their original order.
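            // For example, with the dictionary from the docs above and the token
            // "dampfschifffahrt", `self.cuts` is `[0, 5, 11]` and this loop pushes
            // "fahrt", then "schiff", then "dampf".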
            for pos in self.cuts.iter().rev() {
                let (head, tail) = text.split_at(*pos);
                text = head;

                self.parts.push(Token {
                    text: tail.to_owned(),
                    ..token.clone()
                });
            }
        }
    }
}

impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
    fn advance(&mut self) -> bool {
        self.parts.pop();

        if !self.parts.is_empty() {
            return true;
        }

        if !self.tail.advance() {
            return false;
        }

        // Will yield either `self.parts.last()` or
        // `self.tail.token()` if it could not be split.
        self.split();

        true
    }

    fn token(&self) -> &Token {
        self.parts.last().unwrap_or_else(|| self.tail.token())
    }

    fn token_mut(&mut self) -> &mut Token {
        self.parts
            .last_mut()
            .unwrap_or_else(|| self.tail.token_mut())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};

    #[test]
    fn splitting_compound_words_works() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));

        {
            let mut stream = tokenizer.token_stream("");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foo bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobarbaz");
            assert_eq!(stream.next().unwrap().text, "foobarbaz");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("baz foobar qux");
            assert_eq!(stream.next().unwrap().text, "baz");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "qux");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar foo bar foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
            assert_eq!(stream.next().unwrap().text, "foobazbar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar qux foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "qux");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("barfoo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next(), None);
        }
    }
}
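
// The module below is an illustrative sketch, not part of the original filter:
// it assumes the `aho-corasick` builder used above also exposes
// `ascii_case_insensitive`, and shows how `from_automaton` allows tuning the
// automaton (here: case-insensitive matching) while keeping the required
// leftmost-longest match kind. The module and test names are hypothetical.
#[cfg(test)]
mod from_automaton_example {
    use aho_corasick::{AhoCorasickBuilder, MatchKind};

    use super::SplitCompoundWords;
    use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};

    #[test]
    fn splitting_with_custom_automaton_works() {
        // Build the automaton ourselves instead of relying on the defaults
        // of `from_dictionary`, enabling ASCII case-insensitive matching.
        let dict = AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostLongest)
            .ascii_case_insensitive(true)
            .build(["dampf", "schiff", "fahrt"]);

        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(SplitCompoundWords::from_automaton(dict));

        // The compound is split even though its casing differs from the dictionary.
        let mut stream = tokenizer.token_stream("Dampfschifffahrt");
        assert_eq!(stream.next().unwrap().text, "Dampf");
        assert_eq!(stream.next().unwrap().text, "schiff");
        assert_eq!(stream.next().unwrap().text, "fahrt");
        assert_eq!(stream.next(), None);
    }
}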