Bump aho-corasick dependency to version 1.0 and adjust to API changes (#2002)

* Drop the additional Arc layer, as the automaton itself is now cheap to clone.
* Drop the state ID type parameter, as it is no longer exposed by the library.
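
For context, a minimal sketch (not part of this commit) of what these two points look like against the aho-corasick 1.0 API: `build` is now fallible, the automaton type carries no `S: StateID` parameter, and cloning shares the automaton's internals, so the extra `Arc` wrapper becomes unnecessary.

```rust
use aho_corasick::{AhoCorasickBuilder, MatchKind};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // aho-corasick 1.0: `build` returns a Result instead of panicking,
    // and the automaton is plain `AhoCorasick`, without `<S: StateID>`.
    let automaton = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostLongest)
        .build(["dampf", "schiff", "fahrt"])?;

    // Cloning is cheap because the internals are shared, so wrapping the
    // automaton in an `Arc` is no longer necessary.
    let cheap_copy = automaton.clone();
    assert_eq!(cheap_copy.find_iter("dampfschifffahrt").count(), 3);
    Ok(())
}
```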
Authored by Adam Reichold on 2023-04-18 07:34:30 +02:00, committed by GitHub
parent e522163a1c, commit c1defdda05
2 changed files with 33 additions and 33 deletions

Cargo.toml

@@ -20,7 +20,7 @@ byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
-aho-corasick = "0.7"
+aho-corasick = "1.0"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.5.3", optional = true }
 lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
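
Beyond the version bump, the tokenizer change below makes `SplitCompoundWords::from_dictionary` return a `Result`. As a hedged sketch (not part of this commit, assuming tantivy's public `tantivy::Result` alias and the builder API shown in the updated doctest), calling code can propagate the new error with `?` instead of unwrapping:

```rust
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};

// Hypothetical helper: surfaces the error from `from_dictionary`
// instead of calling `.unwrap()` as the doctest does.
fn build_compound_analyzer() -> tantivy::Result<TextAnalyzer> {
    let filter = SplitCompoundWords::from_dictionary(["dampf", "schiff", "fahrt"])?;
    Ok(TextAnalyzer::builder(SimpleTokenizer).filter(filter).build())
}
```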

src/tokenizer/split_compound_words.rs

@@ -1,6 +1,4 @@
-use std::sync::Arc;
-
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 
 use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
@@ -24,9 +22,12 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
 ///
 /// let tokenizer =
 ///     TextAnalyzer::builder(SimpleTokenizer)
-///         .filter(SplitCompoundWords::from_dictionary([
-///             "dampf", "schiff", "fahrt", "brot", "backen", "automat",
-///         ]))
+///         .filter(
+///             SplitCompoundWords::from_dictionary([
+///                 "dampf", "schiff", "fahrt", "brot", "backen", "automat",
+///             ])
+///             .unwrap()
+///         )
 ///         .build();
 ///
 /// let mut stream = tokenizer.token_stream("dampfschifffahrt");
@@ -42,45 +43,46 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
 ///
 /// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
 #[derive(Clone)]
-pub struct SplitCompoundWords<S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWords {
+    dict: AhoCorasick,
 }
 
-impl SplitCompoundWords<usize> {
+impl SplitCompoundWords {
     /// Create a filter from a given dictionary.
     ///
     /// The dictionary will be used to construct an [`AhoCorasick`] automaton
     /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
     /// more control over its construction is required.
-    pub fn from_dictionary<I, P>(dict: I) -> Self
+    pub fn from_dictionary<I, P>(dict: I) -> crate::Result<Self>
     where
         I: IntoIterator<Item = P>,
         P: AsRef<[u8]>,
     {
         let dict = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build(dict);
+            .build(dict)
+            .map_err(|err| {
+                crate::TantivyError::InvalidArgument(format!(
+                    "Failed to build Aho-Corasick automaton from dictionary: {err}"
+                ))
+            })?;
 
-        Self::from_automaton(dict)
+        Ok(Self::from_automaton(dict))
     }
-}
 
-impl<S: StateID> SplitCompoundWords<S> {
     /// Create a filter from a given automaton.
     ///
     /// The automaton should use one of the leftmost-first match kinds
     /// and it should not be anchored.
-    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
-        Self {
-            dict: Arc::new(dict),
-        }
+    pub fn from_automaton(dict: AhoCorasick) -> Self {
+        Self { dict }
     }
 }
 
-impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
-    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T, S>;
+impl TokenFilter for SplitCompoundWords {
+    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
 
-    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T, S> {
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
         SplitCompoundWordsFilter {
             dict: self.dict,
             inner: tokenizer,
@@ -89,15 +91,13 @@ impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
 }
 
 #[derive(Clone)]
-pub struct SplitCompoundWordsFilter<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsFilter<T> {
+    dict: AhoCorasick,
     inner: T,
 }
 
-impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
-    for SplitCompoundWordsFilter<T, S>
-{
-    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>, S>;
+impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
 
     fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
         SplitCompoundWordsTokenStream {
@@ -109,14 +109,14 @@ impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
     }
 }
 
-pub struct SplitCompoundWordsTokenStream<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsTokenStream<T> {
+    dict: AhoCorasick,
     tail: T,
     cuts: Vec<usize>,
     parts: Vec<Token>,
 }
 
-impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
     // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
     // can fully be split into consecutive matches against `self.dict`.
     fn split(&mut self) {
@@ -152,7 +152,7 @@ impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
     }
 }
 
-impl<T: TokenStream, S: StateID> TokenStream for SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
     fn advance(&mut self) -> bool {
         self.parts.pop();
 
@@ -189,7 +189,7 @@ mod tests {
     #[test]
     fn splitting_compound_words_works() {
         let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
-            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]))
+            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap())
             .build();
 
         {