refactor: try upgrade regex-automata (#3575)

* refactor: try upgrade regex-automata

Signed-off-by: tison <wander4096@gmail.com>

* try fix

Signed-off-by: tison <wander4096@gmail.com>

* always check match with next_eoi_state

Signed-off-by: tison <wander4096@gmail.com>

* add a guard to prevent over moving the state

Signed-off-by: tison <wander4096@gmail.com>

* tidy

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
This commit is contained in:
tison
2024-03-26 12:28:14 +08:00
committed by GitHub
parent 62d8bbb10c
commit 7c1c6e8b8c
4 changed files with 61 additions and 26 deletions

13
Cargo.lock generated
View File

@@ -4378,7 +4378,7 @@ dependencies = [
"prost 0.12.3",
"rand",
"regex",
"regex-automata 0.2.0",
"regex-automata 0.4.3",
"snafu",
"tempfile",
"tokio",
@@ -7801,17 +7801,6 @@ dependencies = [
"regex-syntax 0.6.29",
]
[[package]]
name = "regex-automata"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"
dependencies = [
"fst",
"memchr",
"regex-syntax 0.6.29",
]
[[package]]
name = "regex-automata"
version = "0.4.3"

View File

@@ -125,7 +125,7 @@ prost = "0.12"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.8"
regex = "1.8"
regex-automata = { version = "0.2", features = ["transducer"] }
regex-automata = { version = "0.4" }
reqwest = { version = "0.11", default-features = false, features = [
"json",
"rustls-tls-native-roots",

View File

@@ -113,7 +113,7 @@ pub enum Error {
#[snafu(display("Failed to parse regex DFA"))]
ParseDFA {
#[snafu(source)]
error: Box<regex_automata::dfa::Error>,
error: Box<regex_automata::dfa::dense::BuildError>,
location: Location,
},

View File

@@ -17,6 +17,10 @@ use std::mem::size_of;
use fst::map::OpBuilder;
use fst::{IntoStreamer, Streamer};
use regex_automata::dfa::dense::DFA;
use regex_automata::dfa::Automaton;
use regex_automata::util::primitives::StateID;
use regex_automata::util::start::Config;
use regex_automata::Anchored;
use snafu::{ensure, ResultExt};
use crate::inverted_index::error::{
@@ -32,7 +36,53 @@ pub struct IntersectionFstApplier {
ranges: Vec<Range>,
/// A list of `Dfa` compiled from regular expression patterns.
dfas: Vec<DFA<Vec<u32>>>,
dfas: Vec<DfaFstAutomaton>,
}
/// Newtype around a dense regex DFA (`DFA<Vec<u32>>`).
///
/// It exists so that `fst::Automaton` can be implemented for the wrapped DFA in this module.
#[derive(Debug)]
struct DfaFstAutomaton(DFA<Vec<u32>>);
impl fst::Automaton for DfaFstAutomaton {
    type State = StateID;

    /// Returns the start state of the wrapped DFA for an unanchored search.
    ///
    /// # Panics
    ///
    /// Panics if the DFA reports an error when asked for that start state.
    #[inline]
    fn start(&self) -> Self::State {
        self.0
            .start_state(&Config::new().anchored(Anchored::No))
            .unwrap()
    }

    /// Reports whether `state` is a match state of the wrapped DFA.
    #[inline]
    fn is_match(&self, state: &Self::State) -> bool {
        let id = *state;
        self.0.is_match_state(id)
    }

    /// Reports whether a match is still reachable, i.e. `state` is not the dead state.
    #[inline]
    fn can_match(&self, state: &Self::State) -> bool {
        let dead = self.0.is_dead_state(*state);
        !dead
    }

    /// Resolves the state to use once the input is exhausted.
    ///
    /// A state that is already a match state is returned unchanged; any other state
    /// is moved through the DFA's end-of-input transition. The result is always `Some`.
    #[inline]
    fn accept_eof(&self, state: &StateID) -> Option<StateID> {
        let current = *state;
        let resolved = if self.0.is_match_state(current) {
            current
        } else {
            self.0.next_eoi_state(current)
        };
        Some(resolved)
    }

    /// Feeds one input `byte` to the DFA and returns the resulting state.
    ///
    /// A state that is already a match state is kept as-is, so the automaton is
    /// never advanced past a match it has already found.
    #[inline]
    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
        let current = *state;
        if self.0.is_match_state(current) {
            current
        } else {
            self.0.next_state(current, byte)
        }
    }
}
impl IntersectionFstApplier {
    /// Builds an applier from the given key `ranges` and compiled `dfas`,
    /// wrapping every DFA in a `DfaFstAutomaton`.
    fn new(ranges: Vec<Range>, dfas: Vec<DFA<Vec<u32>>>) -> Self {
        Self {
            ranges,
            dfas: dfas.into_iter().map(DfaFstAutomaton).collect(),
        }
    }
}
impl FstApplier for IntersectionFstApplier {
@@ -86,7 +136,7 @@ impl FstApplier for IntersectionFstApplier {
size += self.dfas.capacity() * size_of::<DFA<Vec<u32>>>();
for dfa in &self.dfas {
size += dfa.memory_usage();
size += dfa.0.memory_usage();
}
size
}
@@ -119,7 +169,7 @@ impl IntersectionFstApplier {
}
}
Ok(Self { dfas, ranges })
Ok(Self::new(ranges, dfas))
}
}
@@ -365,18 +415,15 @@ mod tests {
#[test]
fn test_intersection_fst_applier_memory_usage() {
let applier = IntersectionFstApplier {
ranges: vec![],
dfas: vec![],
};
let applier = IntersectionFstApplier::new(vec![], vec![]);
assert_eq!(applier.memory_usage(), 0);
let dfa = DFA::new("^abc$").unwrap();
assert_eq!(dfa.memory_usage(), 320);
let applier = IntersectionFstApplier {
ranges: vec![Range {
let applier = IntersectionFstApplier::new(
vec![Range {
lower: Some(Bound {
value: b"aa".to_vec(),
inclusive: true,
@@ -386,9 +433,8 @@ mod tests {
inclusive: true,
}),
}],
dfas: vec![dfa],
};
vec![dfa],
);
assert_eq!(
applier.memory_usage(),
size_of::<Range>() + 4 + size_of::<DFA<Vec<u32>>>() + 320