From c1defdda05add793dd8df37e77a3eb2ec16908bf Mon Sep 17 00:00:00 2001
From: Adam Reichold
Date: Tue, 18 Apr 2023 07:34:30 +0200
Subject: [PATCH] Bump aho-corasick dependency to version 1.0 and adjust to API changes (#2002)

* Drop additional Arc-layer as the automaton itself is now cheap-to-clone.

* Drop state ID type parameter as it is not exposed by the library any more.
---
 Cargo.toml                            |  2 +-
 src/tokenizer/split_compound_words.rs | 64 +++++++++++++--------------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 13c5ab14f..7ecba5302 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,7 @@ byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
-aho-corasick = "0.7"
+aho-corasick = "1.0"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.5.3", optional = true }
 lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }

diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs
index e79c48bac..e80e6b31f 100644
--- a/src/tokenizer/split_compound_words.rs
+++ b/src/tokenizer/split_compound_words.rs
@@ -1,6 +1,4 @@
-use std::sync::Arc;
-
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 
 use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
@@ -24,9 +22,12 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
 ///
 /// let tokenizer =
 ///     TextAnalyzer::builder(SimpleTokenizer)
-///         .filter(SplitCompoundWords::from_dictionary([
-///             "dampf", "schiff", "fahrt", "brot", "backen", "automat",
-///         ]))
+///         .filter(
+///             SplitCompoundWords::from_dictionary([
+///                 "dampf", "schiff", "fahrt", "brot", "backen", "automat",
+///             ])
+///             .unwrap()
+///         )
 ///         .build();
 ///
 /// let mut stream = tokenizer.token_stream("dampfschifffahrt");
@@ -42,45 +43,46 @@
 ///
 /// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
 #[derive(Clone)]
-pub struct SplitCompoundWords<S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWords {
+    dict: AhoCorasick,
 }
 
-impl SplitCompoundWords<usize> {
+impl SplitCompoundWords {
     /// Create a filter from a given dictionary.
     ///
    /// The dictionary will be used to construct an [`AhoCorasick`] automaton
    /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
    /// more control over its construction is required.
-    pub fn from_dictionary<I, P>(dict: I) -> Self
+    pub fn from_dictionary<I, P>(dict: I) -> crate::Result<Self>
     where
         I: IntoIterator<Item = P>,
         P: AsRef<[u8]>,
     {
         let dict = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build(dict);
+            .build(dict)
+            .map_err(|err| {
+                crate::TantivyError::InvalidArgument(format!(
+                    "Failed to build Aho-Corasick automaton from dictionary: {err}"
+                ))
+            })?;
 
-        Self::from_automaton(dict)
+        Ok(Self::from_automaton(dict))
     }
-}
 
-impl<S: StateID> SplitCompoundWords<S> {
     /// Create a filter from a given automaton.
     ///
     /// The automaton should use one of the leftmost-first match kinds
     /// and it should not be anchored.
-    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
-        Self {
-            dict: Arc::new(dict),
-        }
+    pub fn from_automaton(dict: AhoCorasick) -> Self {
+        Self { dict }
     }
 }
 
-impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
-    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T, S>;
+impl TokenFilter for SplitCompoundWords {
+    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
 
-    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T, S> {
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
         SplitCompoundWordsFilter {
             dict: self.dict,
             inner: tokenizer,
@@ -89,15 +91,13 @@ impl TokenFilter for SplitCompoundWords {
 }
 
 #[derive(Clone)]
-pub struct SplitCompoundWordsFilter<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsFilter<T> {
+    dict: AhoCorasick,
     inner: T,
 }
 
-impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
-    for SplitCompoundWordsFilter<T, S>
-{
-    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>, S>;
+impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
 
     fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
         SplitCompoundWordsTokenStream {
@@ -109,14 +109,14 @@ impl Tokenizer
     }
 }
 
-pub struct SplitCompoundWordsTokenStream<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsTokenStream<T> {
+    dict: AhoCorasick,
     tail: T,
     cuts: Vec<usize>,
     parts: Vec<Token>,
 }
 
-impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
     // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
     // can fully be split into consecutive matches against `self.dict`.
     fn split(&mut self) {
@@ -152,7 +152,7 @@ impl SplitCompoundWordsTokenStream {
     }
 }
 
-impl<T: TokenStream, S: StateID> TokenStream for SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
     fn advance(&mut self) -> bool {
         self.parts.pop();
 
@@ -189,7 +189,7 @@ mod tests {
     #[test]
     fn splitting_compound_words_works() {
         let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
-            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]))
+            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap())
             .build();
 
         {
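For reference, a minimal standalone sketch (not taken from the tantivy sources) of the aho-corasick 1.0 behaviour this patch adapts to: `AhoCorasickBuilder::build` is now fallible and returns a `Result`, the state ID type parameter is no longer exposed, and the built automaton is cheap to clone, which is why the error mapping was added and the extra `Arc` layer dropped. The dictionary contents, the `main` wrapper, and the assertion below are illustrative only.

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // aho-corasick 1.0: `build` returns Result<AhoCorasick, BuildError>
    // instead of the automaton itself, hence the new `map_err` in the patch.
    let dict: AhoCorasick = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostLongest)
        .build(["dampf", "schiff", "fahrt"])?;

    // The 1.0 automaton is internally shared and cheap to clone, so the
    // filter no longer needs to wrap it in an Arc of its own.
    let cloned = dict.clone();
    assert_eq!(cloned.find_iter("dampfschifffahrt").count(), 3);

    Ok(())
}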