From 62709b8094123dff440764d8c2d039b7cc66ddf3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 19 May 2023 12:07:10 +0900 Subject: [PATCH] Change in the query grammar. (#2050) * Change in the query grammar. Quotation mark can now be used for phrase queries. The delimiter is part of the `UserInputLeaf`. That information is meant to be used in Quickwit to solve #3364. This PR also adds support for quotation marks escaping in phrase queries. * Apply suggestions from code review --- query-grammar/src/lib.rs | 4 +- query-grammar/src/query_grammar.rs | 163 ++++++++++++++++--------- query-grammar/src/user_input_ast.rs | 28 ++++- src/query/query_parser/query_parser.rs | 31 +++-- 4 files changed, 148 insertions(+), 78 deletions(-) diff --git a/query-grammar/src/lib.rs b/query-grammar/src/lib.rs index 9b1e4007e..9e8d0a419 100644 --- a/query-grammar/src/lib.rs +++ b/query-grammar/src/lib.rs @@ -7,7 +7,9 @@ use combine::parser::Parser; pub use crate::occur::Occur; use crate::query_grammar::parse_to_ast; -pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; +pub use crate::user_input_ast::{ + Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral, +}; pub struct Error; diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index 6c8c2ae14..41c5b2cb9 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -5,13 +5,14 @@ use combine::parser::range::{take_while, take_while1}; use combine::parser::repeat::escaped; use combine::parser::Parser; use combine::{ - attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by, + any, attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by, skip_many1, value, }; use once_cell::sync::Lazy; use regex::Regex; use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; +use crate::user_input_ast::Delimiter; use crate::Occur; // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to @@ -133,16 +134,41 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> { recognize((date, char('T'), time)) } -fn term_val<'a>() -> impl Parser<&'a str, Output = String> { - let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"')); - negative_number().or(phrase.or(word())) +fn escaped_character<'a>() -> impl Parser<&'a str, Output = char> { + (char('\\'), any()).map(|(_, x)| x) +} + +fn escaped_string<'a>(delimiter: char) -> impl Parser<&'a str, Output = String> { + ( + char(delimiter), + many(choice(( + escaped_character(), + satisfy(move |c: char| c != delimiter), + ))), + char(delimiter), + ) + .map(|(_, s, _)| s) +} + +fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> { + let double_quotes = escaped_string('"').map(|phrase| (Delimiter::DoubleQuotes, phrase)); + let single_quotes = escaped_string('\'').map(|phrase| (Delimiter::SingleQuotes, phrase)); + let text_no_delimiter = word().map(|text| (Delimiter::None, text)); + negative_number() + .map(|negative_number_str| (Delimiter::None, negative_number_str)) + .or(double_quotes) + .or(single_quotes) + .or(text_no_delimiter) } fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> { - (field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral { - field_name: Some(field_name), - phrase, - slop, + (field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| { + 
UserInputLiteral { + field_name: Some(field_name), + phrase, + delimiter, + slop, + } }) } @@ -159,11 +185,13 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> { } fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral { - field_name: None, - phrase, - slop, - }); + let term_default_field = + (term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral { + field_name: None, + phrase, + delimiter, + slop, + }); attempt(term_query()) .or(term_default_field) @@ -268,7 +296,11 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { /// Function that parses a set out of a Stream /// Supports ranges like: `IN [val1 val2 val3]` fn set<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_list = between(char('['), char(']'), sep_by(term_val(), spaces())); + let term_list = between( + char('['), + char(']'), + sep_by(term_val().map(|(_delimiter, text)| text), spaces()), + ); let set_content = ((string("IN"), spaces()), term_list).map(|(_, elements)| elements); @@ -486,6 +518,7 @@ mod test { assert_eq!(remaining, ""); } + #[track_caller] fn test_parse_query_to_ast_helper(query: &str, expected: &str) { let query = parse_to_ast().parse(query).unwrap().0; let query_str = format!("{query:?}"); @@ -504,8 +537,9 @@ mod test { #[test] fn test_parse_query_to_ast_hyphen() { test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\""); - test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\""); - test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\""); + test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'"); + test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); + test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); } #[test] @@ -514,25 +548,25 @@ mod test { format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)" ); - test_parse_query_to_ast_helper("NOTa", "\"NOTa\""); - test_parse_query_to_ast_helper("NOT a", "(-\"a\")"); + test_parse_query_to_ast_helper("NOTa", "NOTa"); + test_parse_query_to_ast_helper("NOT a", "(-a)"); } #[test] fn test_boosting() { assert!(parse_to_ast().parse("a^2^3").is_err()); assert!(parse_to_ast().parse("a^2^").is_err()); - test_parse_query_to_ast_helper("a^3", "(\"a\")^3"); - test_parse_query_to_ast_helper("a^3 b^2", "(*(\"a\")^3 *(\"b\")^2)"); - test_parse_query_to_ast_helper("a^1", "\"a\""); + test_parse_query_to_ast_helper("a^3", "(a)^3"); + test_parse_query_to_ast_helper("a^3 b^2", "(*(a)^3 *(b)^2)"); + test_parse_query_to_ast_helper("a^1", "a"); } #[test] fn test_parse_query_to_ast_binary_op() { - test_parse_query_to_ast_helper("a AND b", "(+\"a\" +\"b\")"); - test_parse_query_to_ast_helper("a OR b", "(?\"a\" ?\"b\")"); - test_parse_query_to_ast_helper("a OR b AND c", "(?\"a\" ?(+\"b\" +\"c\"))"); - test_parse_query_to_ast_helper("a AND b AND c", "(+\"a\" +\"b\" +\"c\")"); + test_parse_query_to_ast_helper("a AND b", "(+a +b)"); + test_parse_query_to_ast_helper("a OR b", "(?a ?b)"); + test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))"); + test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)"); assert_eq!( format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)" @@ -574,7 +608,7 @@ mod test { fn test_occur_leaf() { let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap(); assert_eq!(occur, Some(Occur::Must)); - assert_eq!(format!("{ast:?}"), "\"abc\""); + 
assert_eq!(format!("{ast:?}"), "abc"); } #[test] @@ -728,56 +762,62 @@ mod test { #[test] fn test_parse_query_to_triming_spaces() { - test_parse_query_to_ast_helper(" abc", "\"abc\""); - test_parse_query_to_ast_helper("abc ", "\"abc\""); - test_parse_query_to_ast_helper("( a OR abc)", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("a OR abc ", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc )", "(?\"a\" ?\"abc\")"); - test_parse_query_to_ast_helper("(a OR abc) ", "(?\"a\" ?\"abc\")"); + test_parse_query_to_ast_helper(" abc", "abc"); + test_parse_query_to_ast_helper("abc ", "abc"); + test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)"); + test_parse_query_to_ast_helper("a OR abc ", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc )", "(?a ?abc)"); + test_parse_query_to_ast_helper("(a OR abc) ", "(?a ?abc)"); } #[test] fn test_parse_query_single_term() { - test_parse_query_to_ast_helper("abc", "\"abc\""); + test_parse_query_to_ast_helper("abc", "abc"); } #[test] fn test_parse_query_default_clause() { - test_parse_query_to_ast_helper("a b", "(*\"a\" *\"b\")"); + test_parse_query_to_ast_helper("a b", "(*a *b)"); } #[test] fn test_parse_query_must_default_clause() { - test_parse_query_to_ast_helper("+(a b)", "(*\"a\" *\"b\")"); + test_parse_query_to_ast_helper("+(a b)", "(*a *b)"); } #[test] fn test_parse_query_must_single_term() { - test_parse_query_to_ast_helper("+d", "\"d\""); + test_parse_query_to_ast_helper("+d", "d"); } #[test] fn test_single_term_with_field() { - test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\""); + test_parse_query_to_ast_helper("abc:toto", "\"abc\":toto"); + } + + #[test] + fn test_phrase_with_field() { + test_parse_query_to_ast_helper("abc:\"happy tax payer\"", "\"abc\":\"happy tax payer\""); + test_parse_query_to_ast_helper("abc:'happy tax payer'", "\"abc\":'happy tax payer'"); } #[test] fn test_single_term_with_float() { - test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\""); - test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\""); - test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\""); + test_parse_query_to_ast_helper("abc:1.1", "\"abc\":1.1"); + test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":1.1"); + test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":1.1"); } #[test] fn test_must_clause() { - test_parse_query_to_ast_helper("(+a +b)", "(+\"a\" +\"b\")"); + test_parse_query_to_ast_helper("(+a +b)", "(+a +b)"); } #[test] fn test_parse_test_query_plus_a_b_plus_d() { - test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")"); + test_parse_query_to_ast_helper("+(a b) +d", "(+(*a *b) +d)"); } #[test] @@ -790,13 +830,13 @@ mod test { #[test] fn test_parse_test_query_other() { - test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")"); - test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\""); - test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\""); - test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")"); - test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")"); + test_parse_query_to_ast_helper("(+a +b) d", "(*(+a +b) *d)"); + test_parse_query_to_ast_helper("+abc:toto", "\"abc\":toto"); + 
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":toto"); + test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":toto -titi)"); + test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":toto)"); test_is_parse_err("--abc:toto"); - test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")"); + test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)"); test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\""); test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]"); } @@ -821,11 +861,10 @@ mod test { assert!(parse_to_ast().parse("foo:\"a b\"~").is_err()); assert!(parse_to_ast().parse("\"a b\"~a").is_err()); assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err()); - - test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")"); + test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *~4)"); test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\""); - test_parse_query_to_ast_helper("~Document", "\"~Document\""); - test_parse_query_to_ast_helper("a~2", "\"a~2\""); + test_parse_query_to_ast_helper("~Document", "~Document"); + test_parse_query_to_ast_helper("a~2", "a~2"); test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\""); test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1"); test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3"); @@ -835,7 +874,19 @@ mod test { #[test] fn test_not_queries_are_consistent() { - test_parse_query_to_ast_helper("tata -toto", "(*\"tata\" -\"toto\")"); - test_parse_query_to_ast_helper("tata NOT toto", "(*\"tata\" -\"toto\")"); + test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)"); + test_parse_query_to_ast_helper("tata NOT toto", "(*tata -toto)"); + } + + #[test] + fn test_escaping() { + test_parse_query_to_ast_helper( + r#"myfield:"hello\"happy\'tax""#, + r#""myfield":"hello"happy'tax""#, + ); + test_parse_query_to_ast_helper( + r#"myfield:'hello\"happy\'tax'"#, + r#""myfield":'hello"happy'tax'"#, + ); } } diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs index a1f565fd1..ba4613a53 100644 --- a/query-grammar/src/user_input_ast.rs +++ b/query-grammar/src/user_input_ast.rs @@ -19,7 +19,7 @@ pub enum UserInputLeaf { } impl Debug for UserInputLeaf { - fn fmt(&self, formatter: &mut Formatter<'_>) -> Result<(), fmt::Error> { + fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> { match self { UserInputLeaf::Literal(literal) => literal.fmt(formatter), UserInputLeaf::Range { @@ -40,11 +40,11 @@ impl Debug for UserInputLeaf { write!(formatter, "\"{field}\": ")?; } write!(formatter, "IN [")?; - for (i, element) in elements.iter().enumerate() { + for (i, text) in elements.iter().enumerate() { if i != 0 { write!(formatter, " ")?; } - write!(formatter, "\"{element}\"")?; + write!(formatter, "\"{text}\"")?; } write!(formatter, "]") } @@ -53,19 +53,37 @@ impl Debug for UserInputLeaf { } } +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Delimiter { + SingleQuotes, + DoubleQuotes, + None, +} + #[derive(PartialEq)] pub struct UserInputLiteral { pub field_name: Option, pub phrase: String, + pub delimiter: Delimiter, pub slop: u32, } impl fmt::Debug for UserInputLiteral { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { if let Some(ref field) = self.field_name { write!(formatter, "\"{field}\":")?; } - write!(formatter, "\"{}\"", self.phrase)?; + match self.delimiter { + Delimiter::SingleQuotes => { + 
write!(formatter, "'{}'", self.phrase)?; + } + Delimiter::DoubleQuotes => { + write!(formatter, "\"{}\"", self.phrase)?; + } + Delimiter::None => { + write!(formatter, "{}", self.phrase)?; + } + } if self.slop > 0 { write!(formatter, "~{}", self.slop)?; } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 70f88904e..e8f1c0f6e 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -129,24 +129,22 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// /// The language covered by the current parser is extremely simple. /// -/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using tantivy's -/// [`SimpleTokenizer`](crate::tokenizer::SimpleTokenizer), hence becoming `["barack", "obama"]`. -/// The terms are then searched within the default terms of the query parser. +/// * simple terms: "e.g.: `Barack Obama` will be seen as a sequence of two tokens Barack and Obama. +/// By default, the query parser will interpret this as a disjunction (see +/// `.set_conjunction_by_default()`) and will match all documents that contains either "Barack" or +/// "Obama" or both. Since we did not target a specific field, the query parser will look into the +/// so-called default fields (as set up in the constructor). /// -/// e.g. If `body` and `title` are default fields, our example terms are -/// `["title:barack", "body:barack", "title:obama", "body:obama"]`. +/// Assuming that the default fields are `body` and `title`, and the query parser is set with +/// conjunction as a default, our query will be interpreted as. +/// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`. /// By default, all tokenized and indexed fields are default fields. /// -/// Multiple terms are handled as an `OR` : any document containing at least -/// one of the term will go through the scoring. -/// -/// This behavior is slower, but is not a bad idea if the user is sorting -/// by relevance : The user typically just scans through the first few -/// documents in order of decreasing relevance and will stop when the documents -/// are not relevant anymore. -/// -/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`. -/// +/// It is possible to explicitly target a field by prefixing the text by the `fieldname:`. +/// Note this only applies to the term directly following. +/// For instance, assuming the query parser is configured to use conjunction by default, +/// `body:Barack Obama` is not interpreted as `body:Barack AND body:Obama` but as +/// `body:Barack OR (body:Barack OR text:Obama)` . /// /// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is /// interpreted @@ -165,7 +163,8 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// /// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. e.g., /// `title:"Barack Obama"` will only find documents that have "barack" immediately followed by -/// "obama". +/// "obama". Single quotes can also be used. If the text to be searched contains quotation mark, +/// it is possible to escape them with a \. /// /// * range terms: Range searches can be done by specifying the start and end bound. These can be /// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains a