From f428f344da427e8deb86d3e35b4f7e5bcc708624 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 8 Aug 2019 17:48:21 +0900 Subject: [PATCH] Various bugfix in the query parser (#619) --- CHANGELOG.md | 5 +- src/lib.rs | 2 +- src/query/query_parser/logical_ast.rs | 1 - src/query/query_parser/query_grammar.rs | 215 +++++++++++------------ src/query/query_parser/user_input_ast.rs | 45 +---- 5 files changed, 115 insertions(+), 153 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37841207d..cf482e3a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,10 @@ Tantivy 0.11.0 ===================== - Added f64 field. Internally reuse u64 code the same way i64 does (@fdb-hiroshima) -- Closes #609. Better handling of hyphens in query parser. +- Various bugfixes in the query parser. + - Better handling of hyphens in query parser. (#609) + - Better handling of whitespaces. + Tantivy 0.10.1 ===================== diff --git a/src/lib.rs b/src/lib.rs index 44683bf90..e8c70e482 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,9 @@ #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] +#![recursion_limit = "100"] #![cfg_attr(all(feature = "unstable", test), feature(test))] #![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] #![warn(missing_docs)] -#![recursion_limit = "80"] //! # `tantivy` //! diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs index d3a25a1fd..299aa1241 100644 --- a/src/query/query_parser/logical_ast.rs +++ b/src/query/query_parser/logical_ast.rs @@ -18,7 +18,6 @@ pub enum LogicalLiteral { All, } -#[derive(Clone)] pub enum LogicalAST { Clause(Vec<(Occur, LogicalAST)>), Leaf(Box), diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 9d3348aa1..a38020957 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -1,4 +1,3 @@ -use super::query_grammar; use super::user_input_ast::*; use crate::query::occur::Occur; use crate::query::query_parser::user_input_ast::UserInputBound; @@ -13,7 +12,7 @@ parser! { ( letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')), - ).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)) + ).skip(char(':')).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)) } } @@ -21,18 +20,17 @@ parser! { fn word[I]()(I) -> String where [I: Stream] { ( - satisfy(|c: char| c.is_alphanumeric()), + satisfy(|c: char| !c.is_whitespace() && !['-', '`', ':', '{', '}', '"', '[', ']', '(',')'].contains(&c) ), many(satisfy(|c: char| !c.is_whitespace() && ![':', '{', '}', '"', '[', ']', '(',')'].contains(&c))) ) .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)) - .and_then(|s: String| { + .and_then(|s: String| match s.as_str() { "OR" => Err(StreamErrorFor::::unexpected_static_message("OR")), "AND" => Err(StreamErrorFor::::unexpected_static_message("AND")), "NOT" => Err(StreamErrorFor::::unexpected_static_message("NOT")), _ => Ok(s) - } - }) + }) } } @@ -41,12 +39,13 @@ parser! { where [I: Stream] { let term_val = || { - let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s); + let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"')); phrase.or(word()) }; let term_val_with_field = negative_number().or(term_val()); let term_query = - (field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral { + (field(), term_val_with_field) + .map(|(field_name, phrase)| UserInputLiteral { field_name: Some(field_name), phrase, }); @@ -64,8 +63,8 @@ parser! { fn negative_number[I]()(I) -> String where [I: Stream] { - (char('-'), many1(satisfy(char::is_numeric))) - .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)) + (char('-'), many1(satisfy(char::is_numeric))) + .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)) } } @@ -79,27 +78,23 @@ parser! { parser! { fn range[I]()(I) -> UserInputLeaf where [I: Stream] { - let term_val = || { - word().or(negative_number()).or(char('*').map(|_| "*".to_string())) - }; - let lower_bound = { - let excl = (char('{'), term_val()).map(|(_, w)| UserInputBound::Exclusive(w)); - let incl = (char('['), term_val()).map(|(_, w)| UserInputBound::Inclusive(w)); - attempt(excl).or(incl) - }; - let upper_bound = { - let excl = (term_val(), char('}')).map(|(w, _)| UserInputBound::Exclusive(w)); - let incl = (term_val(), char(']')).map(|(w, _)| UserInputBound::Inclusive(w)); - attempt(excl).or(incl) + let range_term_val = || { + word().or(negative_number()).or(char('*').with(value("*".to_string()))) }; + let lower_bound = (one_of("{[".chars()), range_term_val()) + .map(|(boundary_char, lower_bound): (char, String)| + if boundary_char == '{' { UserInputBound::Exclusive(lower_bound) } + else { UserInputBound::Inclusive(lower_bound) }); + let upper_bound = (range_term_val(), one_of("}]".chars())) + .map(|(higher_bound, boundary_char): (String, char)| + if boundary_char == '}' { UserInputBound::Exclusive(higher_bound) } + else { UserInputBound::Inclusive(higher_bound) }); ( - optional((field(), char(':')).map(|x| x.0)), - lower_bound, - spaces(), - string("TO"), - spaces(), + optional(field()), + lower_bound + .skip((spaces(), string("TO"), spaces())), upper_bound, - ).map(|(field, lower, _, _, _, upper)| UserInputLeaf::Range { + ).map(|(field, lower, upper)| UserInputLeaf::Range { field, lower, upper @@ -107,23 +102,28 @@ parser! { } } +fn negate(expr: UserInputAST) -> UserInputAST { + expr.unary(Occur::MustNot) +} + +fn must(expr: UserInputAST) -> UserInputAST { + expr.unary(Occur::Must) +} + parser! { fn leaf[I]()(I) -> UserInputAST where [I: Stream] { - (char('-'), leaf()).map(|(_, expr)| expr.unary(Occur::MustNot) ) - .or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) )) - .or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr)) - .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) )) - .or(attempt( - (string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot)) - ) - ) - .or(attempt( - range().map(UserInputAST::from))) - .or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf)))) + char('-').with(leaf()).map(negate) + .or(char('+').with(leaf()).map(must)) + .or(char('(').with(ast()).skip(char(')'))) + .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All))) + .or(attempt(string("NOT").skip(spaces1()).with(leaf()).map(negate))) + .or(attempt(range().map(UserInputAST::from))) + .or(literal().map(UserInputAST::from)) } } +#[derive(Clone, Copy)] enum BinaryOperand { Or, And, @@ -131,27 +131,54 @@ enum BinaryOperand { parser! { fn binary_operand[I]()(I) -> BinaryOperand - where [I: Stream] { - (spaces1(), - ( - string("AND").map(|_| BinaryOperand::And) - .or(string("OR").map(|_| BinaryOperand::Or)) - ), - spaces1()).map(|(_, op,_)| op) + where [I: Stream] + { + string("AND").with(value(BinaryOperand::And)) + .or(string("OR").with(value(BinaryOperand::Or))) } } -enum Element { - SingleEl(UserInputAST), - NormalDisjunctive(Vec>), +fn aggregate_binary_expressions( + left: UserInputAST, + others: Vec<(BinaryOperand, UserInputAST)>, +) -> UserInputAST { + let mut dnf: Vec> = vec![vec![left]]; + for (operator, operand_ast) in others { + match operator { + BinaryOperand::And => { + if let Some(last) = dnf.last_mut() { + last.push(operand_ast); + } + } + BinaryOperand::Or => { + dnf.push(vec![operand_ast]); + } + } + } + if dnf.len() == 1 { + UserInputAST::and(dnf.into_iter().next().unwrap()) //< safe + } else { + let conjunctions = dnf.into_iter().map(UserInputAST::and).collect(); + UserInputAST::or(conjunctions) + } } -impl Element { - pub fn into_dnf(self) -> Vec> { - match self { - Element::NormalDisjunctive(conjunctions) => conjunctions, - Element::SingleEl(el) => vec![vec![el]], - } +parser! { + pub fn ast[I]()(I) -> UserInputAST + where [I: Stream] + { + let operand_leaf = (binary_operand().skip(spaces()), leaf().skip(spaces())); + let boolean_expr = (leaf().skip(spaces().silent()), many1(operand_leaf)).map( + |(left, right)| aggregate_binary_expressions(left,right)); + let whitespace_separated_leaves = many1(leaf().skip(spaces().silent())) + .map(|subqueries: Vec| + if subqueries.len() == 1 { + subqueries.into_iter().next().unwrap() + } else { + UserInputAST::Clause(subqueries.into_iter().collect()) + }); + let expr = attempt(boolean_expr).or(whitespace_separated_leaves); + spaces().with(expr).skip(spaces()) } } @@ -159,56 +186,7 @@ parser! { pub fn parse_to_ast[I]()(I) -> UserInputAST where [I: Stream] { - ( - attempt( - chainl1( - leaf().map(Element::SingleEl), - binary_operand().map(|op: BinaryOperand| - move |left: Element, right: Element| { - let mut dnf = left.into_dnf(); - if let Element::SingleEl(el) = right { - match op { - BinaryOperand::And => { - if let Some(last) = dnf.last_mut() { - last.push(el); - } - } - BinaryOperand::Or => { - dnf.push(vec!(el)); - } - } - } else { - unreachable!("Please report.") - } - Element::NormalDisjunctive(dnf) - } - ) - ) - .map(query_grammar::Element::into_dnf) - .map(|fnd| { - if fnd.len() == 1 { - UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe - } else { - let conjunctions = fnd - .into_iter() - .map(UserInputAST::and) - .collect(); - UserInputAST::or(conjunctions) - } - }) - ) - .or( - sep_by(leaf(), spaces()) - .map(|subqueries: Vec| { - if subqueries.len() == 1 { - subqueries.into_iter().next().unwrap() - } else { - UserInputAST::Clause(subqueries.into_iter().collect()) - } - }) - ) - ) - + spaces().with(optional(ast()).skip(eof())).map(|opt_ast| opt_ast.unwrap_or(UserInputAST::empty_query())) } } @@ -227,6 +205,11 @@ mod test { assert!(parse_to_ast().parse(query).is_err()); } + #[test] + fn test_parse_empty_to_ast() { + test_parse_query_to_ast_helper("", ""); + } + #[test] fn test_parse_query_to_ast_hyphen() { test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\""); @@ -268,8 +251,24 @@ mod test { ); } + #[test] + fn test_parse_query_to_triming_spaces() { + test_parse_query_to_ast_helper(" abc", "\"abc\""); + test_parse_query_to_ast_helper("abc ", "\"abc\""); + test_parse_query_to_ast_helper("( a OR abc)", "(?(\"a\") ?(\"abc\"))"); + test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))"); + test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))"); + test_parse_query_to_ast_helper("a OR abc ", "(?(\"a\") ?(\"abc\"))"); + test_parse_query_to_ast_helper("(a OR abc )", "(?(\"a\") ?(\"abc\"))"); + test_parse_query_to_ast_helper("(a OR abc) ", "(?(\"a\") ?(\"abc\"))"); + } + #[test] fn test_parse_query_to_ast() { + test_parse_query_to_ast_helper("abc", "\"abc\""); + test_parse_query_to_ast_helper("a b", "(\"a\" \"b\")"); + test_parse_query_to_ast_helper("+(a b)", "+((\"a\" \"b\"))"); + test_parse_query_to_ast_helper("+d", "+(\"d\")"); test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))"); test_parse_query_to_ast_helper("(+a +b) d", "((+(\"a\") +(\"b\")) \"d\")"); test_parse_query_to_ast_helper("(+a)", "+(\"a\")"); @@ -281,11 +280,6 @@ mod test { test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")"); test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")"); test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\""); - test_is_parse_err("abc + "); - } - - #[test] - fn test_parse_query_to_ast_range() { test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]"); test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]"); test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}"); @@ -293,5 +287,6 @@ mod test { test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}"); test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}"); test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}"); + test_is_parse_err("abc + "); } } diff --git a/src/query/query_parser/user_input_ast.rs b/src/query/query_parser/user_input_ast.rs index 649326e68..dc907ed2b 100644 --- a/src/query/query_parser/user_input_ast.rs +++ b/src/query/query_parser/user_input_ast.rs @@ -80,9 +80,6 @@ impl UserInputBound { pub enum UserInputAST { Clause(Vec), Unary(Occur, Box), - // Not(Box), - // Should(Box), - // Must(Box), Leaf(Box), } @@ -92,7 +89,7 @@ impl UserInputAST { } fn compose(occur: Occur, asts: Vec) -> UserInputAST { - assert!(occur != Occur::MustNot); + assert_ne!(occur, Occur::MustNot); assert!(!asts.is_empty()); if asts.len() == 1 { asts.into_iter().next().unwrap() //< safe @@ -105,6 +102,10 @@ impl UserInputAST { } } + pub fn empty_query() -> UserInputAST { + UserInputAST::Clause(Vec::default()) + } + pub fn and(asts: Vec) -> UserInputAST { UserInputAST::compose(Occur::Must, asts) } @@ -114,42 +115,6 @@ impl UserInputAST { } } -/* -impl UserInputAST { - - fn compose_occur(self, occur: Occur) -> UserInputAST { - match self { - UserInputAST::Not(other) => { - let new_occur = compose_occur(Occur::MustNot, occur); - other.simplify() - } - _ => { - self - } - } - } - - pub fn simplify(self) -> UserInputAST { - match self { - UserInputAST::Clause(els) => { - if els.len() == 1 { - return els.into_iter().next().unwrap(); - } else { - return self; - } - } - UserInputAST::Not(els) => { - if els.len() == 1 { - return els.into_iter().next().unwrap(); - } else { - return self; - } - } - } - } -} -*/ - impl From for UserInputLeaf { fn from(literal: UserInputLiteral) -> UserInputLeaf { UserInputLeaf::Literal(literal)