Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-26 20:19:57 +00:00
Bump aho-corasick dependency to version 1.0 and adjust to API changes (#2002)
* Drop the additional Arc layer as the automaton itself is now cheap to clone.
* Drop the state ID type parameter as it is no longer exposed by the library.
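For context, a minimal sketch (not part of the commit) of the aho-corasick 1.0 API surface these changes adapt to: `AhoCorasick` is no longer generic over a state ID, `build` is fallible, and the automaton is internally reference-counted, making clones cheap. The helper `build_dictionary` is illustrative, not tantivy API:

    use aho_corasick::{AhoCorasick, AhoCorasickBuilder, BuildError, MatchKind};

    // Hypothetical helper: against aho-corasick 0.7 this would have returned
    // the infallible `AhoCorasick<S>` with an explicit `StateID` parameter.
    fn build_dictionary<I, P>(patterns: I) -> Result<AhoCorasick, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // In 1.0, `build` returns a `Result`, e.g. failing when the
        // automaton would exceed internal limits.
        let dict = AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostLongest)
            .build(patterns)?;

        // The automaton is now cheap to clone (shared internally),
        // so an extra `Arc` layer around it is redundant.
        let _shared = dict.clone();

        Ok(dict)
    }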
Cargo.toml:

@@ -20,7 +20,7 @@ byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
-aho-corasick = "0.7"
+aho-corasick = "1.0"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.5.3", optional = true }
 lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
src/tokenizer/split_compound_words.rs:

@@ -1,6 +1,4 @@
-use std::sync::Arc;
-
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 
 use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
@@ -24,9 +22,12 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
 ///
 /// let tokenizer =
 ///     TextAnalyzer::builder(SimpleTokenizer)
-///         .filter(SplitCompoundWords::from_dictionary([
-///             "dampf", "schiff", "fahrt", "brot", "backen", "automat",
-///         ]))
+///         .filter(
+///             SplitCompoundWords::from_dictionary([
+///                 "dampf", "schiff", "fahrt", "brot", "backen", "automat",
+///             ])
+///             .unwrap()
+///         )
 ///         .build();
 ///
 /// let mut stream = tokenizer.token_stream("dampfschifffahrt");
@@ -42,45 +43,46 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
 ///
 /// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
 #[derive(Clone)]
-pub struct SplitCompoundWords<S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWords {
+    dict: AhoCorasick,
 }
 
-impl SplitCompoundWords<usize> {
+impl SplitCompoundWords {
     /// Create a filter from a given dictionary.
     ///
     /// The dictionary will be used to construct an [`AhoCorasick`] automaton
     /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
     /// more control over its construction is required.
-    pub fn from_dictionary<I, P>(dict: I) -> Self
+    pub fn from_dictionary<I, P>(dict: I) -> crate::Result<Self>
     where
         I: IntoIterator<Item = P>,
         P: AsRef<[u8]>,
     {
         let dict = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build(dict);
+            .build(dict)
+            .map_err(|err| {
+                crate::TantivyError::InvalidArgument(format!(
+                    "Failed to build Aho-Corasick automaton from dictionary: {err}"
+                ))
+            })?;
 
-        Self::from_automaton(dict)
+        Ok(Self::from_automaton(dict))
     }
-}
 
-impl<S: StateID> SplitCompoundWords<S> {
     /// Create a filter from a given automaton.
     ///
     /// The automaton should use one of the leftmost-first match kinds
     /// and it should not be anchored.
-    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
-        Self {
-            dict: Arc::new(dict),
-        }
+    pub fn from_automaton(dict: AhoCorasick) -> Self {
+        Self { dict }
     }
 }
 
-impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
-    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T, S>;
+impl TokenFilter for SplitCompoundWords {
+    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
 
-    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T, S> {
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
         SplitCompoundWordsFilter {
             dict: self.dict,
             inner: tokenizer,
@@ -89,15 +91,13 @@ impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
 }
 
 #[derive(Clone)]
-pub struct SplitCompoundWordsFilter<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsFilter<T> {
+    dict: AhoCorasick,
     inner: T,
 }
 
-impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
-    for SplitCompoundWordsFilter<T, S>
-{
-    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>, S>;
+impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
 
     fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
         SplitCompoundWordsTokenStream {
@@ -109,14 +109,14 @@ impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
     }
 }
 
-pub struct SplitCompoundWordsTokenStream<T, S: StateID> {
-    dict: Arc<AhoCorasick<S>>,
+pub struct SplitCompoundWordsTokenStream<T> {
+    dict: AhoCorasick,
     tail: T,
     cuts: Vec<usize>,
     parts: Vec<Token>,
 }
 
-impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
     // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
     // can fully be split into consecutive matches against `self.dict`.
     fn split(&mut self) {
@@ -152,7 +152,7 @@ impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
     }
 }
 
-impl<T: TokenStream, S: StateID> TokenStream for SplitCompoundWordsTokenStream<T, S> {
+impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
     fn advance(&mut self) -> bool {
         self.parts.pop();
 
@@ -189,7 +189,7 @@ mod tests {
     #[test]
     fn splitting_compound_words_works() {
         let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
-            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]))
+            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap())
             .build();
 
         {
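On the caller side, mirroring the doctest and test updates above, the now-fallible constructor can also be propagated with `?` instead of unwrapped. A sketch (the function name is illustrative, not part of the commit):

    use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};

    fn build_analyzer() -> tantivy::Result<TextAnalyzer> {
        // `from_dictionary` now returns `crate::Result<Self>`, so the
        // `InvalidArgument` error can bubble up instead of panicking.
        let splitter = SplitCompoundWords::from_dictionary(["dampf", "schiff", "fahrt"])?;

        Ok(TextAnalyzer::builder(SimpleTokenizer)
            .filter(splitter)
            .build())
    }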