From 62709b8094123dff440764d8c2d039b7cc66ddf3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 19 May 2023 12:07:10 +0900 Subject: [PATCH] Change in the query grammar. (#2050) * Change in the query grammar. Quotation mark can now be used for phrase queries. The delimiter is part of the `UserInputLeaf`. That information is meant to be used in Quickwit to solve #3364. This PR also adds support for quotation marks escaping in phrase queries. * Apply suggestions from code review --- query-grammar/src/lib.rs | 4 +- query-grammar/src/query_grammar.rs | 163 ++++++++++++++++--------- query-grammar/src/user_input_ast.rs | 28 ++++- src/query/query_parser/query_parser.rs | 31 +++-- 4 files changed, 148 insertions(+), 78 deletions(-) diff --git a/query-grammar/src/lib.rs b/query-grammar/src/lib.rs index 9b1e4007e..9e8d0a419 100644 --- a/query-grammar/src/lib.rs +++ b/query-grammar/src/lib.rs @@ -7,7 +7,9 @@ use combine::parser::Parser; pub use crate::occur::Occur; use crate::query_grammar::parse_to_ast; -pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; +pub use crate::user_input_ast::{ + Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral, +}; pub struct Error; diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index 6c8c2ae14..41c5b2cb9 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -5,13 +5,14 @@ use combine::parser::range::{take_while, take_while1}; use combine::parser::repeat::escaped; use combine::parser::Parser; use combine::{ - attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by, + any, attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by, skip_many1, value, }; use once_cell::sync::Lazy; use regex::Regex; use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; +use crate::user_input_ast::Delimiter; use crate::Occur; // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to @@ -133,16 +134,41 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> { recognize((date, char('T'), time)) } -fn term_val<'a>() -> impl Parser<&'a str, Output = String> { - let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"')); - negative_number().or(phrase.or(word())) +fn escaped_character<'a>() -> impl Parser<&'a str, Output = char> { + (char('\\'), any()).map(|(_, x)| x) +} + +fn escaped_string<'a>(delimiter: char) -> impl Parser<&'a str, Output = String> { + ( + char(delimiter), + many(choice(( + escaped_character(), + satisfy(move |c: char| c != delimiter), + ))), + char(delimiter), + ) + .map(|(_, s, _)| s) +} + +fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> { + let double_quotes = escaped_string('"').map(|phrase| (Delimiter::DoubleQuotes, phrase)); + let single_quotes = escaped_string('\'').map(|phrase| (Delimiter::SingleQuotes, phrase)); + let text_no_delimiter = word().map(|text| (Delimiter::None, text)); + negative_number() + .map(|negative_number_str| (Delimiter::None, negative_number_str)) + .or(double_quotes) + .or(single_quotes) + .or(text_no_delimiter) } fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> { - (field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral { - field_name: Some(field_name), - phrase, - slop, + (field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| { + 
UserInputLiteral { + field_name: Some(field_name), + phrase, + delimiter, + slop, + } }) } @@ -159,11 +185,13 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> { } fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral { - field_name: None, - phrase, - slop, - }); + let term_default_field = + (term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral { + field_name: None, + phrase, + delimiter, + slop, + }); attempt(term_query()) .or(term_default_field) @@ -268,7 +296,11 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { /// Function that parses a set out of a Stream /// Supports ranges like: `IN [val1 val2 val3]` fn set<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_list = between(char('['), char(']'), sep_by(term_val(), spaces())); + let term_list = between( + char('['), + char(']'), + sep_by(term_val().map(|(_delimiter, text)| text), spaces()), + ); let set_content = ((string("IN"), spaces()), term_list).map(|(_, elements)| elements); @@ -486,6 +518,7 @@ mod test { assert_eq!(remaining, ""); } + #[track_caller] fn test_parse_query_to_ast_helper(query: &str, expected: &str) { let query = parse_to_ast().parse(query).unwrap().0; let query_str = format!("{query:?}"); @@ -504,8 +537,9 @@ mod test { #[test] fn test_parse_query_to_ast_hyphen() { test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\""); - test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\""); - test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\""); + test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'"); + test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); + test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); } #[test] @@ -514,25 +548,25 @@ mod test { format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)" ); - test_parse_query_to_ast_helper("NOTa", "\"NOTa\""); - test_parse_query_to_ast_helper("NOT a", "(-\"a\")"); + test_parse_query_to_ast_helper("NOTa", "NOTa"); + test_parse_query_to_ast_helper("NOT a", "(-a)"); } #[test] fn test_boosting() { assert!(parse_to_ast().parse("a^2^3").is_err()); assert!(parse_to_ast().parse("a^2^").is_err()); - test_parse_query_to_ast_helper("a^3", "(\"a\")^3"); - test_parse_query_to_ast_helper("a^3 b^2", "(*(\"a\")^3 *(\"b\")^2)"); - test_parse_query_to_ast_helper("a^1", "\"a\""); + test_parse_query_to_ast_helper("a^3", "(a)^3"); + test_parse_query_to_ast_helper("a^3 b^2", "(*(a)^3 *(b)^2)"); + test_parse_query_to_ast_helper("a^1", "a"); } #[test] fn test_parse_query_to_ast_binary_op() { - test_parse_query_to_ast_helper("a AND b", "(+\"a\" +\"b\")"); - test_parse_query_to_ast_helper("a OR b", "(?\"a\" ?\"b\")"); - test_parse_query_to_ast_helper("a OR b AND c", "(?\"a\" ?(+\"b\" +\"c\"))"); - test_parse_query_to_ast_helper("a AND b AND c", "(+\"a\" +\"b\" +\"c\")"); + test_parse_query_to_ast_helper("a AND b", "(+a +b)"); + test_parse_query_to_ast_helper("a OR b", "(?a ?b)"); + test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))"); + test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)"); assert_eq!( format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)" @@ -574,7 +608,7 @@ mod test { fn test_occur_leaf() { let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap(); assert_eq!(occur, Some(Occur::Must)); - assert_eq!(format!("{ast:?}"), "\"abc\""); + 
assert_eq!(format!("{ast:?}"), "abc"); } #[test] @@ -728,56 +762,62 @@ mod test { #[test] fn test_parse_query_to_triming_spaces() { - test_parse_query_to_ast_helper(" abc", "\"abc\""); - test_parse_query_to_ast_helper("abc ", "\"abc\""); - test_parse_query_to_ast_helper("( a OR abc)", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("a OR abc ", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc )", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc) ", "(?\"a\" ?\"abc\")"); + test_parse_query_to_ast_helper(" abc", "abc"); + test_parse_query_to_ast_helper("abc ", "abc"); + test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)"); + test_parse_query_to_ast_helper("a OR abc ", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc )", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc) ", "(?a ?abc)"); } #[test] fn test_parse_query_single_term() { - test_parse_query_to_ast_helper("abc", "\"abc\""); + test_parse_query_to_ast_helper("abc", "abc"); } #[test] fn test_parse_query_default_clause() { - test_parse_query_to_ast_helper("a b", "(*\"a\" *\"b\")"); + test_parse_query_to_ast_helper("a b", "(*a *b)"); } #[test] fn test_parse_query_must_default_clause() { - test_parse_query_to_ast_helper("+(a b)", "(*\"a\" *\"b\")"); + test_parse_query_to_ast_helper("+(a b)", "(*a *b)"); } #[test] fn test_parse_query_must_single_term() { - test_parse_query_to_ast_helper("+d", "\"d\""); + test_parse_query_to_ast_helper("+d", "d"); } #[test] fn test_single_term_with_field() { - test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\""); + test_parse_query_to_ast_helper("abc:toto", "\"abc\":toto"); + } + + #[test] + fn test_phrase_with_field() { + test_parse_query_to_ast_helper("abc:\"happy tax payer\"", "\"abc\":\"happy tax payer\""); + test_parse_query_to_ast_helper("abc:'happy tax payer'", "\"abc\":'happy tax payer'"); } #[test] fn test_single_term_with_float() { - test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\""); - test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\""); - test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\""); + test_parse_query_to_ast_helper("abc:1.1", "\"abc\":1.1"); + test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":1.1"); + test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":1.1"); } #[test] fn test_must_clause() { - test_parse_query_to_ast_helper("(+a +b)", "(+\"a\" +\"b\")"); + test_parse_query_to_ast_helper("(+a +b)", "(+a +b)"); } #[test] fn test_parse_test_query_plus_a_b_plus_d() { - test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")"); + test_parse_query_to_ast_helper("+(a b) +d", "(+(*a *b) +d)"); } #[test] @@ -790,13 +830,13 @@ mod test { #[test] fn test_parse_test_query_other() { - test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")"); - test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\""); - test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\""); - test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")"); - test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")"); + test_parse_query_to_ast_helper("(+a +b) d", "(*(+a +b) *d)"); + test_parse_query_to_ast_helper("+abc:toto", "\"abc\":toto"); + 
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":toto"); + test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":toto -titi)"); + test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":toto)"); test_is_parse_err("--abc:toto"); - test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")"); + test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)"); test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\""); test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]"); } @@ -821,11 +861,10 @@ mod test { assert!(parse_to_ast().parse("foo:\"a b\"~").is_err()); assert!(parse_to_ast().parse("\"a b\"~a").is_err()); assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err()); - - test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")"); + test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *~4)"); test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\""); - test_parse_query_to_ast_helper("~Document", "\"~Document\""); - test_parse_query_to_ast_helper("a~2", "\"a~2\""); + test_parse_query_to_ast_helper("~Document", "~Document"); + test_parse_query_to_ast_helper("a~2", "a~2"); test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\""); test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1"); test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3"); @@ -835,7 +874,19 @@ mod test { #[test] fn test_not_queries_are_consistent() { - test_parse_query_to_ast_helper("tata -toto", "(*\"tata\" -\"toto\")"); - test_parse_query_to_ast_helper("tata NOT toto", "(*\"tata\" -\"toto\")"); + test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)"); + test_parse_query_to_ast_helper("tata NOT toto", "(*tata -toto)"); + } + + #[test] + fn test_escaping() { + test_parse_query_to_ast_helper( + r#"myfield:"hello\"happy\'tax""#, + r#""myfield":"hello"happy'tax""#, + ); + test_parse_query_to_ast_helper( + r#"myfield:'hello\"happy\'tax'"#, + r#""myfield":'hello"happy'tax'"#, + ); } } diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs index a1f565fd1..ba4613a53 100644 --- a/query-grammar/src/user_input_ast.rs +++ b/query-grammar/src/user_input_ast.rs @@ -19,7 +19,7 @@ pub enum UserInputLeaf { } impl Debug for UserInputLeaf { - fn fmt(&self, formatter: &mut Formatter<'_>) -> Result<(), fmt::Error> { + fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> { match self { UserInputLeaf::Literal(literal) => literal.fmt(formatter), UserInputLeaf::Range { @@ -40,11 +40,11 @@ impl Debug for UserInputLeaf { write!(formatter, "\"{field}\": ")?; } write!(formatter, "IN [")?; - for (i, element) in elements.iter().enumerate() { + for (i, text) in elements.iter().enumerate() { if i != 0 { write!(formatter, " ")?; } - write!(formatter, "\"{element}\"")?; + write!(formatter, "\"{text}\"")?; } write!(formatter, "]") } @@ -53,19 +53,37 @@ impl Debug for UserInputLeaf { } } +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Delimiter { + SingleQuotes, + DoubleQuotes, + None, +} + #[derive(PartialEq)] pub struct UserInputLiteral { pub field_name: Option, pub phrase: String, + pub delimiter: Delimiter, pub slop: u32, } impl fmt::Debug for UserInputLiteral { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { if let Some(ref field) = self.field_name { write!(formatter, "\"{field}\":")?; } - write!(formatter, "\"{}\"", self.phrase)?; + match self.delimiter { + Delimiter::SingleQuotes => { + 
write!(formatter, "'{}'", self.phrase)?; + } + Delimiter::DoubleQuotes => { + write!(formatter, "\"{}\"", self.phrase)?; + } + Delimiter::None => { + write!(formatter, "{}", self.phrase)?; + } + } if self.slop > 0 { write!(formatter, "~{}", self.slop)?; } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 70f88904e..e8f1c0f6e 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -129,24 +129,22 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// /// The language covered by the current parser is extremely simple. /// -/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using tantivy's -/// [`SimpleTokenizer`](crate::tokenizer::SimpleTokenizer), hence becoming `["barack", "obama"]`. -/// The terms are then searched within the default terms of the query parser. +/// * simple terms: "e.g.: `Barack Obama` will be seen as a sequence of two tokens Barack and Obama. +/// By default, the query parser will interpret this as a disjunction (see +/// `.set_conjunction_by_default()`) and will match all documents that contains either "Barack" or +/// "Obama" or both. Since we did not target a specific field, the query parser will look into the +/// so-called default fields (as set up in the constructor). /// -/// e.g. If `body` and `title` are default fields, our example terms are -/// `["title:barack", "body:barack", "title:obama", "body:obama"]`. +/// Assuming that the default fields are `body` and `title`, and the query parser is set with +/// conjunction as a default, our query will be interpreted as. +/// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`. /// By default, all tokenized and indexed fields are default fields. /// -/// Multiple terms are handled as an `OR` : any document containing at least -/// one of the term will go through the scoring. -/// -/// This behavior is slower, but is not a bad idea if the user is sorting -/// by relevance : The user typically just scans through the first few -/// documents in order of decreasing relevance and will stop when the documents -/// are not relevant anymore. -/// -/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`. -/// +/// It is possible to explicitly target a field by prefixing the text by the `fieldname:`. +/// Note this only applies to the term directly following. +/// For instance, assuming the query parser is configured to use conjunction by default, +/// `body:Barack Obama` is not interpreted as `body:Barack AND body:Obama` but as +/// `body:Barack OR (body:Barack OR text:Obama)` . /// /// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is /// interpreted @@ -165,7 +163,8 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// /// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. e.g., /// `title:"Barack Obama"` will only find documents that have "barack" immediately followed by -/// "obama". +/// "obama". Single quotes can also be used. If the text to be searched contains quotation mark, +/// it is possible to escape them with a \. /// /// * range terms: Range searches can be done by specifying the start and end bound. These can be /// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains a