Merge pull request #1649 from adamreichold/split-compound-words
RFC: Add dictionary-based SplitCompoundWords token filter.
Cargo.toml

@@ -20,6 +20,7 @@ byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
+aho-corasick = "0.7"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.5.3", optional = true }
 lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
src/tokenizer/mod.rs

@@ -126,6 +126,7 @@ mod ngram_tokenizer;
 mod raw_tokenizer;
 mod remove_long;
 mod simple_tokenizer;
+mod split_compound_words;
 mod stemmer;
 mod stop_word_filter;
 mod tokenized_string;
@@ -141,6 +142,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
 pub use self::raw_tokenizer::RawTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::simple_tokenizer::SimpleTokenizer;
+pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
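With `SplitCompoundWords` exported from `tantivy::tokenizer`, a downstream crate can wire the filter into an index by registering it under a custom analyzer name. The sketch below is not part of this commit; the analyzer name `de_compound`, the field name `text`, and the three-entry dictionary are placeholders chosen for illustration.

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
use tantivy::Index;

fn main() {
    // Declare a text field whose indexing refers to the analyzer by name.
    let mut schema_builder = Schema::builder();
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("de_compound")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let options = TextOptions::default().set_indexing_options(indexing);
    schema_builder.add_text_field("text", options);
    let schema = schema_builder.build();

    // Build the analyzer and register it under that name before indexing documents.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(SplitCompoundWords::from_dictionary(["dampf", "schiff", "fahrt"]));
    let index = Index::create_in_ram(schema);
    index.tokenizers().register("de_compound", analyzer);
}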
src/tokenizer/split_compound_words.rs (new file, 252 lines)

@@ -0,0 +1,252 @@
use std::sync::Arc;

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};

use super::{BoxTokenStream, Token, TokenFilter, TokenStream};

/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
///
/// Words will only be split if they can be fully decomposed into
/// consecutive matches against the given dictionary.
///
/// This is mostly useful to split [compound nouns][compound] common to many
/// Germanic languages into their constituents.
///
/// # Example
///
/// The quality of the dictionary determines the quality of the splits,
/// e.g. the missing stem "back" of "backen" means that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
///     TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
///         "dampf", "schiff", "fahrt", "brot", "backen", "automat",
///     ]));
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
/// assert_eq!(stream.next().unwrap().text, "schiff");
/// assert_eq!(stream.next().unwrap().text, "fahrt");
/// assert_eq!(stream.next(), None);
///
/// let mut stream = tokenizer.token_stream("brotbackautomat");
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
/// assert_eq!(stream.next(), None);
/// ```
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords<S: StateID> {
    dict: Arc<AhoCorasick<S>>,
}

impl SplitCompoundWords<usize> {
    /// Create a filter from a given dictionary.
    ///
    /// The dictionary will be used to construct an [`AhoCorasick`] automaton
    /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
    /// more control over its construction is required.
    pub fn from_dictionary<I, P>(dict: I) -> Self
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let dict = AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostLongest)
            .build(dict);

        Self::from_automaton(dict)
    }
}

impl<S: StateID> SplitCompoundWords<S> {
    /// Create a filter from a given automaton.
    ///
    /// The automaton should use one of the leftmost match kinds
    /// and it should not be anchored.
    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
        Self {
            dict: Arc::new(dict),
        }
    }
}

impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
    fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(SplitCompoundWordsTokenStream {
            dict: self.dict.clone(),
            tail: stream,
            cuts: Vec::new(),
            parts: Vec::new(),
        })
    }
}

struct SplitCompoundWordsTokenStream<'a, S: StateID> {
    dict: Arc<AhoCorasick<S>>,
    tail: BoxTokenStream<'a>,
    cuts: Vec<usize>,
    parts: Vec<Token>,
}

impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
    // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
    // can fully be split into consecutive matches against `self.dict`.
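    // For example, with the dictionary {"foo", "bar"}, the token "foobar"
    // yields `cuts == [0, 3]` and `parts == ["bar", "foo"]` (pushed in reverse,
    // so `pop()` returns "foo" first), whereas "foobaz" leaves `parts` empty
    // because the trailing "baz" is not in the dictionary.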
    fn split(&mut self) {
        let token = self.tail.token();
        let mut text = token.text.as_str();

        self.cuts.clear();
        let mut pos = 0;

        for match_ in self.dict.find_iter(text) {
            if pos != match_.start() {
                break;
            }

            self.cuts.push(pos);
            pos = match_.end();
        }

        if pos == token.text.len() {
            // Fill `self.parts` in reverse order,
            // so that `self.parts.pop()` yields
            // the tokens in their original order.
            for pos in self.cuts.iter().rev() {
                let (head, tail) = text.split_at(*pos);

                text = head;
                self.parts.push(Token {
                    text: tail.to_owned(),
                    ..*token
                });
            }
        }
    }
}

impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
    fn advance(&mut self) -> bool {
        self.parts.pop();

        if !self.parts.is_empty() {
            return true;
        }

        if !self.tail.advance() {
            return false;
        }

        // Will yield either `self.parts.last()` or
        // `self.tail.token()` if it could not be split.
        self.split();
        true
    }

    fn token(&self) -> &Token {
        self.parts.last().unwrap_or_else(|| self.tail.token())
    }

    fn token_mut(&mut self) -> &mut Token {
        self.parts
            .last_mut()
            .unwrap_or_else(|| self.tail.token_mut())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};

    #[test]
    fn splitting_compound_words_works() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));

        {
            let mut stream = tokenizer.token_stream("");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foo bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobarbaz");
            assert_eq!(stream.next().unwrap().text, "foobarbaz");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("baz foobar qux");
            assert_eq!(stream.next().unwrap().text, "baz");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "qux");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar foo bar foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
            assert_eq!(stream.next().unwrap().text, "foobazbar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar qux foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "qux");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("barfoo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next(), None);
        }
    }
}
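If the defaults chosen by `from_dictionary` are not sufficient, the automaton can be built by hand and passed to `from_automaton`. The sketch below is not part of this commit and assumes aho-corasick 0.7 as pinned in Cargo.toml; the `ascii_case_insensitive` setting and the dictionary entries are illustrative only.

use aho_corasick::{AhoCorasickBuilder, MatchKind};
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};

fn main() {
    // Build the automaton explicitly, e.g. to match dictionary entries
    // ASCII case-insensitively, and hand it to the filter.
    let dict = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostLongest)
        .ascii_case_insensitive(true)
        .build(["dampf", "schiff", "fahrt"]);

    let tokenizer = TextAnalyzer::from(SimpleTokenizer)
        .filter(SplitCompoundWords::from_automaton(dict));

    // The capitalized compound is still decomposed; the parts keep their original casing.
    let mut stream = tokenizer.token_stream("Dampfschifffahrt");
    assert_eq!(stream.next().unwrap().text, "Dampf");
    assert_eq!(stream.next().unwrap().text, "schiff");
    assert_eq!(stream.next().unwrap().text, "fahrt");
    assert_eq!(stream.next(), None);
}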