From c1defdda05add793dd8df37e77a3eb2ec16908bf Mon Sep 17 00:00:00 2001
From: Adam Reichold
Date: Tue, 18 Apr 2023 07:34:30 +0200
Subject: [PATCH] Bump aho-corasick dependency to version 1.0 and adjust to API changes (#2002)

* Drop additional Arc-layer as the automaton itself is now cheap-to-clone.

* Drop state ID type parameter as it is not exposed by the library any more.
---
 Cargo.toml                            |  2 +-
 src/tokenizer/split_compound_words.rs | 64 +++++++++++++--------------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 13c5ab14f..7ecba5302 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,7 @@ byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
-aho-corasick = "0.7"
+aho-corasick = "1.0"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.5.3", optional = true }
 lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }

diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs
index e79c48bac..e80e6b31f 100644
--- a/src/tokenizer/split_compound_words.rs
+++ b/src/tokenizer/split_compound_words.rs
@@ -1,6 +1,4 @@
-use std::sync::Arc;
-
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 
 use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
@@ -24,9 +22,12 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
 ///
 /// let tokenizer =
 ///     TextAnalyzer::builder(SimpleTokenizer)
-///         .filter(SplitCompoundWords::from_dictionary([
-///             "dampf", "schiff", "fahrt", "brot", "backen", "automat",
-///         ]))
+///         .filter(
+///             SplitCompoundWords::from_dictionary([
+///                 "dampf", "schiff", "fahrt", "brot", "backen", "automat",
+///             ])
+///             .unwrap()
+///         )
 ///         .build();
 ///
 /// let mut stream = tokenizer.token_stream("dampfschifffahrt");
@@ -42,45 +43,46 @@
 ///
 /// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
 #[derive(Clone)]
-pub struct SplitCompoundWords<S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWords {
+    dict: AhoCorasick,
 }
 
-impl SplitCompoundWords<usize> {
+impl SplitCompoundWords {
     /// Create a filter from a given dictionary.
     ///
    /// The dictionary will be used to construct an [`AhoCorasick`] automaton
    /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
    /// more control over its construction is required.
-    pub fn from_dictionary<I, P>(dict: I) -> Self
+    pub fn from_dictionary<I, P>(dict: I) -> crate::Result<Self>
     where
         I: IntoIterator<Item = P>,
         P: AsRef<[u8]>,
     {
         let dict = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build(dict);
+            .build(dict)
+            .map_err(|err| {
+                crate::TantivyError::InvalidArgument(format!(
+                    "Failed to build Aho-Corasick automaton from dictionary: {err}"
+                ))
+            })?;
 
-        Self::from_automaton(dict)
+        Ok(Self::from_automaton(dict))
     }
-}
 
-impl<S: StateID> SplitCompoundWords<S> {
     /// Create a filter from a given automaton.
     ///
     /// The automaton should use one of the leftmost-first match kinds
     /// and it should not be anchored.
-    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
-        Self {
-            dict: Arc::new(dict),
-        }
+    pub fn from_automaton(dict: AhoCorasick) -> Self {
+        Self { dict }
     }
 }
 
-impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
-    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T, S>;
+impl TokenFilter for SplitCompoundWords {
+    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
 
-    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T, S> {
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
         SplitCompoundWordsFilter {
             dict: self.dict,
             inner: tokenizer,
@@ -89,15 +91,13 @@ impl TokenFilter for SplitCompoundWords {
 }
 
 #[derive(Clone)]
-pub struct SplitCompoundWordsFilter<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsFilter<T> {
+    dict: AhoCorasick,
     inner: T,
 }
 
-impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
-    for SplitCompoundWordsFilter<T, S>
-{
-    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>, S>;
+impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
 
     fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
         SplitCompoundWordsTokenStream {
@@ -109,14 +109,14 @@ impl Tokenizer
     }
 }
 
-pub struct SplitCompoundWordsTokenStream<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsTokenStream<T> {
+    dict: AhoCorasick,
     tail: T,
     cuts: Vec<usize>,
     parts: Vec<Token>,
 }
 
-impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
     // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
     // can fully be split into consecutive matches against `self.dict`.
     fn split(&mut self) {
@@ -152,7 +152,7 @@ impl SplitCompoundWordsTokenStream {
     }
 }
 
-impl<T: TokenStream, S: StateID> TokenStream for SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
     fn advance(&mut self) -> bool {
         self.parts.pop();
 
@@ -189,7 +189,7 @@ mod tests {
     #[test]
     fn splitting_compound_words_works() {
         let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
-            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]))
+            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap())
             .build();
 
         {
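For reference, a minimal standalone sketch (not taken from the tantivy sources) of the aho-corasick 1.0 behaviour this patch adapts to: `AhoCorasickBuilder::build` is now fallible and returns a `Result`, the state ID type parameter is no longer exposed, and the built automaton is cheap to clone, which is why the error mapping was added and the extra `Arc` layer dropped. The dictionary contents, the `main` wrapper, and the assertion below are illustrative only.

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // aho-corasick 1.0: `build` returns Result<AhoCorasick, BuildError>
    // instead of the automaton itself, hence the new `map_err` in the patch.
    let dict: AhoCorasick = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostLongest)
        .build(["dampf", "schiff", "fahrt"])?;

    // The 1.0 automaton is internally shared and cheap to clone, so the
    // filter no longer needs to wrap it in an Arc of its own.
    let cloned = dict.clone();
    assert_eq!(cloned.find_iter("dampfschifffahrt").count(), 3);

    Ok(())
}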