Change in the query grammar. (#2050)

* Change in the query grammar.

Single quotation marks can now be used to delimit phrase queries, in addition to double quotation marks.
The delimiter is part of the `UserInputLeaf`.
That information is meant to be used in Quickwit to solve #3364.

This PR also adds support for escaping quotation marks in phrase
queries.

* Apply suggestions from code review
This commit is contained in:
Paul Masurel
2023-05-19 12:07:10 +09:00
committed by GitHub
parent 04562c0318
commit 62709b8094
4 changed files with 148 additions and 78 deletions

View File

@@ -7,7 +7,9 @@ use combine::parser::Parser;
pub use crate::occur::Occur;
use crate::query_grammar::parse_to_ast;
pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
pub use crate::user_input_ast::{
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
};
pub struct Error;

View File

@@ -5,13 +5,14 @@ use combine::parser::range::{take_while, take_while1};
use combine::parser::repeat::escaped;
use combine::parser::Parser;
use combine::{
attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by,
any, attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by,
skip_many1, value,
};
use once_cell::sync::Lazy;
use regex::Regex;
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use crate::user_input_ast::Delimiter;
use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
@@ -133,16 +134,41 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
recognize((date, char('T'), time))
}
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
negative_number().or(phrase.or(word()))
/// Parses a backslash followed by any single character and yields that
/// character.
///
/// The backslash itself is discarded, so the input `\"` produces `"`.
fn escaped_character<'a>() -> impl Parser<&'a str, Output = char> {
    char('\\').with(any())
}
fn escaped_string<'a>(delimiter: char) -> impl Parser<&'a str, Output = String> {
(
char(delimiter),
many(choice((
escaped_character(),
satisfy(move |c: char| c != delimiter),
))),
char(delimiter),
)
.map(|(_, s, _)| s)
}
/// Parses the value of a term and reports how it was delimited.
///
/// The alternatives are tried in this order: a negative number, a
/// double-quoted string, a single-quoted string, and finally a bare word.
/// Negative numbers and bare words are reported with `Delimiter::None`.
fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
    let negative = negative_number().map(|number_str| (Delimiter::None, number_str));
    let double_quoted = escaped_string('"').map(|text| (Delimiter::DoubleQuotes, text));
    let single_quoted = escaped_string('\'').map(|text| (Delimiter::SingleQuotes, text));
    let bare_word = word().map(|text| (Delimiter::None, text));
    negative
        .or(double_quoted)
        .or(single_quoted)
        .or(bare_word)
}
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
field_name: Some(field_name),
phrase,
slop,
(field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
UserInputLiteral {
field_name: Some(field_name),
phrase,
delimiter,
slop,
}
})
}
@@ -159,11 +185,13 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
}
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
field_name: None,
phrase,
slop,
});
let term_default_field =
(term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
field_name: None,
phrase,
delimiter,
slop,
});
attempt(term_query())
.or(term_default_field)
@@ -268,7 +296,11 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
/// Function that parses a set out of a Stream
/// Supports ranges like: `IN [val1 val2 val3]`
fn set<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_list = between(char('['), char(']'), sep_by(term_val(), spaces()));
let term_list = between(
char('['),
char(']'),
sep_by(term_val().map(|(_delimiter, text)| text), spaces()),
);
let set_content = ((string("IN"), spaces()), term_list).map(|(_, elements)| elements);
@@ -486,6 +518,7 @@ mod test {
assert_eq!(remaining, "");
}
#[track_caller]
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
let query = parse_to_ast().parse(query).unwrap().0;
let query_str = format!("{query:?}");
@@ -504,8 +537,9 @@ mod test {
#[test]
fn test_parse_query_to_ast_hyphen() {
test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\"");
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
}
#[test]
@@ -514,25 +548,25 @@ mod test {
format!("{:?}", parse_to_ast().parse("NOT")),
"Err(UnexpectedParse)"
);
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
test_parse_query_to_ast_helper("NOT a", "(-\"a\")");
test_parse_query_to_ast_helper("NOTa", "NOTa");
test_parse_query_to_ast_helper("NOT a", "(-a)");
}
#[test]
fn test_boosting() {
assert!(parse_to_ast().parse("a^2^3").is_err());
assert!(parse_to_ast().parse("a^2^").is_err());
test_parse_query_to_ast_helper("a^3", "(\"a\")^3");
test_parse_query_to_ast_helper("a^3 b^2", "(*(\"a\")^3 *(\"b\")^2)");
test_parse_query_to_ast_helper("a^1", "\"a\"");
test_parse_query_to_ast_helper("a^3", "(a)^3");
test_parse_query_to_ast_helper("a^3 b^2", "(*(a)^3 *(b)^2)");
test_parse_query_to_ast_helper("a^1", "a");
}
#[test]
fn test_parse_query_to_ast_binary_op() {
test_parse_query_to_ast_helper("a AND b", "(+\"a\" +\"b\")");
test_parse_query_to_ast_helper("a OR b", "(?\"a\" ?\"b\")");
test_parse_query_to_ast_helper("a OR b AND c", "(?\"a\" ?(+\"b\" +\"c\"))");
test_parse_query_to_ast_helper("a AND b AND c", "(+\"a\" +\"b\" +\"c\")");
test_parse_query_to_ast_helper("a AND b", "(+a +b)");
test_parse_query_to_ast_helper("a OR b", "(?a ?b)");
test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))");
test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)");
assert_eq!(
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
"Err(UnexpectedParse)"
@@ -574,7 +608,7 @@ mod test {
fn test_occur_leaf() {
let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap();
assert_eq!(occur, Some(Occur::Must));
assert_eq!(format!("{ast:?}"), "\"abc\"");
assert_eq!(format!("{ast:?}"), "abc");
}
#[test]
@@ -728,56 +762,62 @@ mod test {
#[test]
fn test_parse_query_to_triming_spaces() {
test_parse_query_to_ast_helper(" abc", "\"abc\"");
test_parse_query_to_ast_helper("abc ", "\"abc\"");
test_parse_query_to_ast_helper("( a OR abc)", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("a OR abc ", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc )", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc) ", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper(" abc", "abc");
test_parse_query_to_ast_helper("abc ", "abc");
test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)");
test_parse_query_to_ast_helper("a OR abc ", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc )", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc) ", "(?a ?abc)");
}
#[test]
fn test_parse_query_single_term() {
test_parse_query_to_ast_helper("abc", "\"abc\"");
test_parse_query_to_ast_helper("abc", "abc");
}
#[test]
fn test_parse_query_default_clause() {
test_parse_query_to_ast_helper("a b", "(*\"a\" *\"b\")");
test_parse_query_to_ast_helper("a b", "(*a *b)");
}
#[test]
fn test_parse_query_must_default_clause() {
test_parse_query_to_ast_helper("+(a b)", "(*\"a\" *\"b\")");
test_parse_query_to_ast_helper("+(a b)", "(*a *b)");
}
#[test]
fn test_parse_query_must_single_term() {
test_parse_query_to_ast_helper("+d", "\"d\"");
test_parse_query_to_ast_helper("+d", "d");
}
#[test]
fn test_single_term_with_field() {
test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\"");
test_parse_query_to_ast_helper("abc:toto", "\"abc\":toto");
}
#[test]
fn test_phrase_with_field() {
    // The quoting style used in the query is kept in the debug output.
    let double_quoted_query = "abc:\"happy tax payer\"";
    test_parse_query_to_ast_helper(double_quoted_query, "\"abc\":\"happy tax payer\"");

    let single_quoted_query = "abc:'happy tax payer'";
    test_parse_query_to_ast_helper(single_quoted_query, "\"abc\":'happy tax payer'");
}
#[test]
fn test_single_term_with_float() {
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\"");
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\"");
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\"");
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":1.1");
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":1.1");
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":1.1");
}
#[test]
fn test_must_clause() {
test_parse_query_to_ast_helper("(+a +b)", "(+\"a\" +\"b\")");
test_parse_query_to_ast_helper("(+a +b)", "(+a +b)");
}
#[test]
fn test_parse_test_query_plus_a_b_plus_d() {
test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")");
test_parse_query_to_ast_helper("+(a b) +d", "(+(*a *b) +d)");
}
#[test]
@@ -790,13 +830,13 @@ mod test {
#[test]
fn test_parse_test_query_other() {
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\"");
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\"");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")");
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")");
test_parse_query_to_ast_helper("(+a +b) d", "(*(+a +b) *d)");
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":toto");
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":toto");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":toto -titi)");
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":toto)");
test_is_parse_err("--abc:toto");
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")");
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)");
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
}
@@ -821,11 +861,10 @@ mod test {
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *~4)");
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
test_parse_query_to_ast_helper("~Document", "~Document");
test_parse_query_to_ast_helper("a~2", "a~2");
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
@@ -835,7 +874,19 @@ mod test {
#[test]
fn test_not_queries_are_consistent() {
test_parse_query_to_ast_helper("tata -toto", "(*\"tata\" -\"toto\")");
test_parse_query_to_ast_helper("tata NOT toto", "(*\"tata\" -\"toto\")");
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
test_parse_query_to_ast_helper("tata NOT toto", "(*tata -toto)");
}
#[test]
fn test_escaping() {
    // Each pair is (query, expected debug output). Both quote characters are
    // backslash-escaped inside the phrase, whichever delimiter encloses it.
    let cases = [
        (
            r#"myfield:"hello\"happy\'tax""#,
            r#""myfield":"hello"happy'tax""#,
        ),
        (
            r#"myfield:'hello\"happy\'tax'"#,
            r#""myfield":'hello"happy'tax'"#,
        ),
    ];
    for (query, expected) in cases {
        test_parse_query_to_ast_helper(query, expected);
    }
}
}

View File

@@ -19,7 +19,7 @@ pub enum UserInputLeaf {
}
impl Debug for UserInputLeaf {
fn fmt(&self, formatter: &mut Formatter<'_>) -> Result<(), fmt::Error> {
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
match self {
UserInputLeaf::Literal(literal) => literal.fmt(formatter),
UserInputLeaf::Range {
@@ -40,11 +40,11 @@ impl Debug for UserInputLeaf {
write!(formatter, "\"{field}\": ")?;
}
write!(formatter, "IN [")?;
for (i, element) in elements.iter().enumerate() {
for (i, text) in elements.iter().enumerate() {
if i != 0 {
write!(formatter, " ")?;
}
write!(formatter, "\"{element}\"")?;
write!(formatter, "\"{text}\"")?;
}
write!(formatter, "]")
}
@@ -53,19 +53,37 @@ impl Debug for UserInputLeaf {
}
}
/// The way a literal was delimited in the user's query.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Delimiter {
    /// The literal was enclosed in single quotes (`'...'`).
    SingleQuotes,
    /// The literal was enclosed in double quotes (`"..."`).
    DoubleQuotes,
    /// The literal was not enclosed in quotes.
    None,
}
/// A literal (a single term or a phrase) as written in the user's query.
#[derive(PartialEq)]
pub struct UserInputLiteral {
/// Field targeted by the literal; `None` when the query did not name one.
pub field_name: Option<String>,
/// Text of the literal, without its surrounding delimiters.
pub phrase: String,
/// How the literal was delimited in the query (single quotes, double
/// quotes, or not at all).
pub delimiter: Delimiter,
/// Slop attached to the literal (`~N` in the query syntax); the debug
/// output omits it when it is 0.
pub slop: u32,
}
impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if let Some(ref field) = self.field_name {
write!(formatter, "\"{field}\":")?;
}
write!(formatter, "\"{}\"", self.phrase)?;
match self.delimiter {
Delimiter::SingleQuotes => {
write!(formatter, "'{}'", self.phrase)?;
}
Delimiter::DoubleQuotes => {
write!(formatter, "\"{}\"", self.phrase)?;
}
Delimiter::None => {
write!(formatter, "{}", self.phrase)?;
}
}
if self.slop > 0 {
write!(formatter, "~{}", self.slop)?;
}