mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 00:02:55 +00:00
@@ -14,4 +14,4 @@ edition = "2018"
|
||||
[dependencies]
|
||||
combine = {version="4", default-features=false, features=[] }
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
|
||||
|
||||
@@ -16,9 +16,9 @@ use crate::Occur;
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||
// special characters.
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
||||
];
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#;
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#;
|
||||
|
||||
/// Parses a field_name
|
||||
/// A field name must have at least one character and be followed by a colon.
|
||||
@@ -120,22 +120,36 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
|
||||
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
|
||||
phrase.or(word())
|
||||
negative_number().or(phrase.or(word()))
|
||||
}
|
||||
|
||||
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
|
||||
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
slop,
|
||||
})
|
||||
}
|
||||
|
||||
fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
|
||||
let slop =
|
||||
(char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::<u32>() {
|
||||
Ok(d) => Ok(d),
|
||||
_ => Err(StringStreamError::UnexpectedParse),
|
||||
});
|
||||
optional(slop).map(|slop| match slop {
|
||||
Some(d) => d,
|
||||
_ => 0,
|
||||
})
|
||||
}
|
||||
|
||||
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
let term_default_field = term_val().map(|phrase| UserInputLiteral {
|
||||
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
slop,
|
||||
});
|
||||
|
||||
attempt(term_query())
|
||||
.or(term_default_field)
|
||||
.map(UserInputLeaf::from)
|
||||
@@ -522,18 +536,10 @@ mod test {
|
||||
super::field_name().parse(".my.field.name:a"),
|
||||
Ok((".my.field.name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"my\ field:a"#),
|
||||
Ok(("my field".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"にんじん:a"#),
|
||||
Ok(("にんじん".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("my\\ field\\ name:a"),
|
||||
Ok(("my field name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"my\field:a"#),
|
||||
Ok((r#"my\field"#.to_string(), "a"))
|
||||
@@ -562,6 +568,17 @@ mod test {
|
||||
super::field_name().parse("_my_field:a"),
|
||||
Ok(("_my_field".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("~my~field:a"),
|
||||
Ok(("~my~field".to_string(), "a"))
|
||||
);
|
||||
for special_char in SPECIAL_CHARS.iter() {
|
||||
let query = &format!("\\{special_char}my\\{special_char}field:a");
|
||||
assert_eq!(
|
||||
super::field_name().parse(&query),
|
||||
Ok((format!("{special_char}my{special_char}field"), "a"))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -714,4 +731,22 @@ mod test {
|
||||
);
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slop() {
|
||||
assert!(parse_to_ast().parse("\"a b\"~").is_err());
|
||||
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
|
||||
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
|
||||
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());
|
||||
|
||||
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
|
||||
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
|
||||
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
|
||||
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
|
||||
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
|
||||
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
|
||||
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
|
||||
test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300");
|
||||
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,14 +40,19 @@ impl Debug for UserInputLeaf {
|
||||
pub struct UserInputLiteral {
|
||||
pub field_name: Option<String>,
|
||||
pub phrase: String,
|
||||
pub slop: u32,
|
||||
}
|
||||
|
||||
impl fmt::Debug for UserInputLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match self.field_name {
|
||||
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
|
||||
None => write!(formatter, "\"{}\"", self.phrase),
|
||||
if let Some(ref field) = self.field_name {
|
||||
write!(formatter, "\"{}\":", field)?;
|
||||
}
|
||||
write!(formatter, "\"{}\"", self.phrase)?;
|
||||
if self.slop > 0 {
|
||||
write!(formatter, "~{}", self.slop)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -43,7 +43,12 @@ impl PhraseQuery {
|
||||
/// Creates a new `PhraseQuery` given a list of terms and their offsets.
|
||||
///
|
||||
/// Can be used to provide custom offset for each term.
|
||||
pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery {
|
||||
pub fn new_with_offset(terms: Vec<(usize, Term)>) -> PhraseQuery {
|
||||
PhraseQuery::new_with_offset_and_slop(terms, 0)
|
||||
}
|
||||
|
||||
/// Creates a new `PhraseQuery` given a list of terms, their offsets and a slop
|
||||
pub fn new_with_offset_and_slop(mut terms: Vec<(usize, Term)>, slop: u32) -> PhraseQuery {
|
||||
assert!(
|
||||
terms.len() > 1,
|
||||
"A phrase query is required to have strictly more than one term."
|
||||
@@ -57,7 +62,7 @@ impl PhraseQuery {
|
||||
PhraseQuery {
|
||||
field,
|
||||
phrase_terms: terms,
|
||||
slop: 0,
|
||||
slop,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::Score;
|
||||
#[derive(Clone)]
|
||||
pub enum LogicalLiteral {
|
||||
Term(Term),
|
||||
Phrase(Vec<(usize, Term)>),
|
||||
Phrase(Vec<(usize, Term)>, u32),
|
||||
Range {
|
||||
field: Field,
|
||||
value_type: Type,
|
||||
@@ -74,7 +74,14 @@ impl fmt::Debug for LogicalLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term),
|
||||
LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms),
|
||||
LogicalLiteral::Phrase(ref terms, slop) => {
|
||||
write!(formatter, "\"{:?}\"", terms)?;
|
||||
if slop > 0 {
|
||||
write!(formatter, "~{:?}", slop)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
LogicalLiteral::Range {
|
||||
ref lower,
|
||||
ref upper,
|
||||
|
||||
@@ -168,6 +168,9 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
|
||||
/// It is also possible to define a boost for a some specific field, at the query parser level.
|
||||
/// (See [`set_boost(...)`](#method.set_field_boost) ). Typically you may want to boost a title
|
||||
/// field.
|
||||
///
|
||||
/// Phrase terms support the `~` slop operator which allows to set the phrase's matching
|
||||
/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
|
||||
#[derive(Clone)]
|
||||
pub struct QueryParser {
|
||||
schema: Schema,
|
||||
@@ -405,6 +408,7 @@ impl QueryParser {
|
||||
field: Field,
|
||||
json_path: &str,
|
||||
phrase: &str,
|
||||
slop: u32,
|
||||
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
@@ -461,6 +465,7 @@ impl QueryParser {
|
||||
field_name,
|
||||
field,
|
||||
phrase,
|
||||
slop,
|
||||
&text_analyzer,
|
||||
index_record_option,
|
||||
)?
|
||||
@@ -626,7 +631,9 @@ impl QueryParser {
|
||||
self.compute_path_triplets_for_literal(&literal)?;
|
||||
let mut asts: Vec<LogicalAst> = Vec::new();
|
||||
for (field, json_path, phrase) in term_phrases {
|
||||
for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? {
|
||||
for ast in
|
||||
self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)?
|
||||
{
|
||||
// Apply some field specific boost defined at the query parser level.
|
||||
let boost = self.field_boost(field);
|
||||
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
|
||||
@@ -670,9 +677,9 @@ impl QueryParser {
|
||||
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
|
||||
match logical_literal {
|
||||
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
|
||||
LogicalLiteral::Phrase(term_with_offsets) => {
|
||||
Box::new(PhraseQuery::new_with_offset(term_with_offsets))
|
||||
}
|
||||
LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
|
||||
PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
|
||||
),
|
||||
LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
@@ -689,6 +696,7 @@ fn generate_literals_for_str(
|
||||
field_name: &str,
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
slop: u32,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
index_record_option: IndexRecordOption,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
@@ -710,7 +718,7 @@ fn generate_literals_for_str(
|
||||
field_name.to_string(),
|
||||
));
|
||||
}
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
Ok(Some(LogicalLiteral::Phrase(terms, slop)))
|
||||
}
|
||||
|
||||
fn generate_literals_for_json_object(
|
||||
@@ -741,7 +749,7 @@ fn generate_literals_for_json_object(
|
||||
field_name.to_string(),
|
||||
));
|
||||
}
|
||||
logical_literals.push(LogicalLiteral::Phrase(terms));
|
||||
logical_literals.push(LogicalLiteral::Phrase(terms, 0));
|
||||
Ok(logical_literals)
|
||||
}
|
||||
|
||||
@@ -1493,4 +1501,23 @@ mod test {
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_slop() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"\"a b\"~0",
|
||||
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]" "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]")"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"\"a b\"~2",
|
||||
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]"~2 "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]"~2)"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b~4\"~2",
|
||||
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b")), (2, Term(type=Str, field=0, "4"))]"~2"#,
|
||||
false,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user