diff --git a/Cargo.toml b/Cargo.toml
index a22037c94..d7864be30 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ byteorder = "1.0"
 crc32fast = "1.2.0"
 once_cell = "1.0"
 regex ={version = "1.3.0", default-features = false, features = ["std"]}
-tantivy-fst = "0.2.1"
+tantivy-fst = {path="../tantivy-fst", version="0.3"}
 memmap = {version = "0.7", optional=true}
 lz4 = {version="1.20", optional=true}
 snap = "1"
@@ -29,7 +29,7 @@ serde = {version="1.0", features=["derive"]}
 serde_json = "1.0"
 num_cpus = "1.2"
 fs2={version="0.4", optional=true}
-levenshtein_automata = "0.1"
+levenshtein_automata = "0.2"
 notify = {version="4", optional=true}
 uuid = { version = "0.8", features = ["v4", "serde"] }
 crossbeam = "0.7"
diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs
index 3bff39d50..1d84b246f 100644
--- a/src/query/fuzzy_query.rs
+++ b/src/query/fuzzy_query.rs
@@ -2,10 +2,39 @@ use crate::query::{AutomatonWeight, Query, Weight};
 use crate::schema::Term;
 use crate::Searcher;
 use crate::TantivyError::InvalidArgument;
-use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
+use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
 use once_cell::sync::Lazy;
 use std::collections::HashMap;
 use std::ops::Range;
+use tantivy_fst::Automaton;
+
+/// Adapter that exposes a `levenshtein_automata` `DFA` through the `tantivy_fst` `Automaton` trait.
+struct DFAWrapper(pub DFA);
+
+impl Automaton for DFAWrapper {
+    type State = u32;
+
+    fn start(&self) -> Self::State {
+        self.0.initial_state()
+    }
+
+    fn is_match(&self, state: &Self::State) -> bool {
+        // A state is a match only when the DFA reports an exact distance for it.
+        match self.0.distance(*state) {
+            Distance::Exact(_) => true,
+            Distance::AtLeast(_) => false,
+        }
+    }
+
+    fn can_match(&self, state: &Self::State) -> bool {
+        // The DFA's sink state is the only state reported as unable to match.
+        *state != levenshtein_automata::SINK_STATE
+    }
+
+    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
+        self.0.transition(*state, byte)
+    }
+}
 
 /// A range of Levenshtein distances that we will build DFAs for our terms
 /// The computation is exponential, so best keep it to low single digits
@@ -101,7 +130,7 @@ impl FuzzyTermQuery {
         }
     }
 
-    fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFA>> {
+    fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFAWrapper>> {
         // LEV_BUILDER is a HashMap, whose `get` method returns an Option
         match LEV_BUILDER.get(&(self.distance, false)) {
             // Unwrap the option and build the Ok(AutomatonWeight)
@@ -111,7 +140,10 @@ impl FuzzyTermQuery {
                 } else {
                     automaton_builder.build_dfa(self.term.text())
                 };
-                Ok(AutomatonWeight::new(self.term.field(), automaton))
+                Ok(AutomatonWeight::new(
+                    self.term.field(),
+                    DFAWrapper(automaton),
+                ))
             }
             None => Err(InvalidArgument(format!(
                 "Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index ec8bdc99e..56b0fd122 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -55,8 +55,8 @@ pub enum QueryParserError {
     /// The tokenizer for the given field is unknown
     /// The two argument strings are the name of the field, the name of the tokenizer
     #[fail(
-        display = "The tokenizer '{:?}' for the field '{:?}' is unknown",
-        _0, _1
+    display = "The tokenizer '{:?}' for the field '{:?}' is unknown",
+    _0, _1
     )]
     UnknownTokenizer(String, String),
     /// The query contains a range query with a phrase as one of the bounds.
@@ -1049,7 +1049,7 @@ mod test {
             test_parse_query_to_logical_ast_helper(
                 "title:a AND title:b",
                 "(+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98]))",
-                default_conjunction
+                default_conjunction,
             );
         }
     }
@@ -1060,9 +1060,8 @@ mod test {
             test_parse_query_to_logical_ast_helper(
                 "title:a OR title:b",
                 "(Term(field=0,bytes=[97]) Term(field=0,bytes=[98]))",
-                default_conjunction
+                default_conjunction,
             );
         }
     }
-
 }