From 2649c8a7158e6b8c504aa6b43e5af98da8d2c420 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 28 Aug 2018 11:03:54 +0900 Subject: [PATCH] Issue/246 (#393) * Moving Range and All to Leaves * Parsing OR/AND * Simplify user input ast * AND and OR supported. Returning an error when mixing syntax Closes #246 * Added support for NOT * Updated changelog --- CHANGELOG.md | 1 + README.md | 4 +- src/query/occur.rs | 35 +++++ src/query/query_parser/query_grammar.rs | 157 ++++++++++++++++++++--- src/query/query_parser/query_parser.rs | 118 ++++++++--------- src/query/query_parser/user_input_ast.rs | 152 ++++++++++++++++++---- 6 files changed, 355 insertions(+), 112 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef956ac52..d2256923a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Tantivy 0.7 - Skip data for doc ids and positions (@fulmicoton), greatly improving performance - Tantivy error now rely on the failure crate (@drusellers) +- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax Tantivy 0.6.1 diff --git a/README.md b/README.md index 57b637020..62638292f 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ Tantivy is, in fact, strongly inspired by Lucene's design. - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:) - Tiny startup time (<10ms), perfect for command line tools - BM25 scoring (the same as lucene) -- Basic query language (`+michael +jackson`) -- Phrase queries search (\"michael jackson\"`) +- Natural query language `(michael AND jackson) OR "king of pop"` +- Phrase queries search (`"michael jackson"`) - Incremental indexing - Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop) - Mmap directory diff --git a/src/query/occur.rs b/src/query/occur.rs index 9bcf02bc2..1a9396de0 100644 --- a/src/query/occur.rs +++ b/src/query/occur.rs @@ -12,3 +12,38 @@ pub enum Occur { /// search. 
MustNot, } + +impl Occur { + /// Returns the one-char prefix symbol for this `Occur`. + /// - `Should` => '?', + /// - `Must` => '+' + /// - `MustNot` => '-' + pub fn to_char(&self) -> char { + match *self { + Occur::Should => '?', + Occur::Must => '+', + Occur::MustNot => '-', + } + } +} + +/// Compose two occur values. +pub fn compose_occur(left: Occur, right: Occur) -> Occur { + match left { + Occur::Should => right, + Occur::Must => { + if right == Occur::MustNot { + Occur::MustNot + } else { + Occur::Must + } + } + Occur::MustNot => { + if right == Occur::MustNot { + Occur::Must + } else { + Occur::MustNot + } + } + } +} \ No newline at end of file diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 352666e8a..557e38e24 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -1,6 +1,9 @@ use super::user_input_ast::*; use combine::char::*; use combine::*; +use combine::stream::StreamErrorFor; +use combine::error::StreamError; +use query::occur::Occur; use query::query_parser::user_input_ast::UserInputBound; parser! { @@ -17,18 +20,25 @@ parser! { fn word[I]()(I) -> String where [I: Stream] { many1(satisfy(|c: char| c.is_alphanumeric())) + .and_then(|s: String| { + match s.as_str() { + "OR" => Err(StreamErrorFor::::unexpected_static_message("OR")), + "AND" => Err(StreamErrorFor::::unexpected_static_message("AND")), + "NOT" => Err(StreamErrorFor::::unexpected_static_message("NOT")), + _ => Ok(s) + } + }) } } parser! { - fn literal[I]()(I) -> UserInputAST + fn literal[I]()(I) -> UserInputLeaf where [I: Stream] { let term_val = || { let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s); phrase.or(word()) }; - let term_val_with_field = negative_number().or(term_val()); let term_query = (field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral { @@ -41,7 +51,7 @@ parser!
{ }); try(term_query) .or(term_default_field) - .map(UserInputAST::from) + .map(UserInputLeaf::from) } } @@ -55,7 +65,14 @@ parser! { } parser! { - fn range[I]()(I) -> UserInputAST + fn spaces1[I]()(I) -> () + where [I: Stream] { + skip_many1(space()) + } +} + +parser! { + fn range[I]()(I) -> UserInputLeaf where [I: Stream] { let term_val = || { word().or(negative_number()).or(char('*').map(|_| "*".to_string())) @@ -77,7 +94,7 @@ parser! { string("TO"), spaces(), upper_bound, - ).map(|(field, lower, _, _, _, upper)| UserInputAST::Range { + ).map(|(field, lower, _, _, _, upper)| UserInputLeaf::Range { field, lower, upper @@ -88,13 +105,53 @@ parser! { parser! { fn leaf[I]()(I) -> UserInputAST where [I: Stream] { - (char('-'), leaf()) - .map(|(_, expr)| UserInputAST::Not(Box::new(expr))) - .or((char('+'), leaf()).map(|(_, expr)| UserInputAST::Must(Box::new(expr)))) + (char('-'), leaf()).map(|(_, expr)| expr.unary(Occur::MustNot) ) + .or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) )) .or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr)) - .or(char('*').map(|_| UserInputAST::All)) - .or(try(range())) - .or(literal()) + .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) )) + .or(try( + (string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot))) + ) + .or( + try( + range() + .map(|leaf| UserInputAST::from(leaf)) + ) + ) + .or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf)))) + } +} + +enum BinaryOperand { + Or, And +} + +parser! 
{ + fn binary_operand[I]()(I) -> BinaryOperand + where [I: Stream] { + (spaces1(), + ( + string("AND").map(|_| BinaryOperand::And) + .or(string("OR").map(|_| BinaryOperand::Or)) + ), + spaces1()).map(|(_, op,_)| op) + } +} + + +enum Element { + SingleEl(UserInputAST), + NormalDisjunctive(Vec>) +} + +impl Element { + pub fn into_dnf(self) -> Vec> { + match self { + Element::NormalDisjunctive(conjunctions) => + conjunctions, + Element::SingleEl(el) => + vec!(vec!(el)), + } } } @@ -102,14 +159,56 @@ parser! { pub fn parse_to_ast[I]()(I) -> UserInputAST where [I: Stream] { - sep_by(leaf(), spaces()) - .map(|subqueries: Vec| { - if subqueries.len() == 1 { - subqueries.into_iter().next().unwrap() - } else { - UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) - } - }) + ( + try( + chainl1( + leaf().map(Element::SingleEl), + binary_operand().map(|op: BinaryOperand| + move |left: Element, right: Element| { + let mut dnf = left.into_dnf(); + if let Element::SingleEl(el) = right { + match op { + BinaryOperand::And => { + if let Some(last) = dnf.last_mut() { + last.push(el); + } + } + BinaryOperand::Or => { + dnf.push(vec!(el)); + } + } + } else { + unreachable!("Please report.") + } + Element::NormalDisjunctive(dnf) + } + ) + ) + .map(|el| el.into_dnf()) + .map(|fnd| { + if fnd.len() == 1 { + UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe + } else { + let conjunctions = fnd + .into_iter() + .map(|conjunction| UserInputAST::and(conjunction)) + .collect(); + UserInputAST::or(conjunctions) + } + }) + ) + .or( + sep_by(leaf(), spaces()) + .map(|subqueries: Vec| { + if subqueries.len() == 1 { + subqueries.into_iter().next().unwrap() + } else { + UserInputAST::Clause(subqueries.into_iter().collect()) + } + }) + ) + ) + } } @@ -128,6 +227,26 @@ mod test { assert!(parse_to_ast().parse(query).is_err()); } + + #[test] + fn test_parse_query_to_ast_not_op() { + assert_eq!(format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)"); + 
test_parse_query_to_ast_helper("NOTa", "\"NOTa\""); + test_parse_query_to_ast_helper("NOT a", "-(\"a\")"); + } + + #[test] + fn test_parse_query_to_ast_binary_op() { + test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))"); + test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))"); + test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))"); + test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))"); + assert_eq!(format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)"); + assert_eq!(format!("{:?}", parse_to_ast().parse("a AND b aaa")), "Err(UnexpectedParse)"); + assert_eq!(format!("{:?}", parse_to_ast().parse("aaa a OR b ")), "Err(UnexpectedParse)"); + assert_eq!(format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), "Err(UnexpectedParse)"); + } + #[test] fn test_parse_query_to_ast() { test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))"); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index abe6b404f..f3a9f37c0 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -5,6 +5,7 @@ use core::Index; use query::AllQuery; use query::BooleanQuery; use query::Occur; +use query::occur::compose_occur; use query::PhraseQuery; use query::Query; use query::RangeQuery; @@ -79,12 +80,22 @@ impl From for QueryParserError { /// /// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`. /// +/// +/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is interpreted +/// as `(a AND b) OR c`. +/// +/// * In addition to the boolean operators, the `-` and `+` prefix operators are also supported. These operators +/// are sufficient to express all queries using boolean operators. For instance `x AND y OR z` can +/// be written (`(+x +y) z`). In addition, these operators can help define "required optional" +/// queries.
`(+x y)` matches the same document set as simply `x`, but `y` will help refining the score. +/// /// * negative terms: By prepending a term by a `-`, a term can be excluded /// from the search. This is useful for disambiguating a query. /// e.g. `apple -fruit` /// /// * must terms: By prepending a term by a `+`, a term can be made required for the search. /// +/// /// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. /// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed /// by "obama". @@ -315,56 +326,27 @@ impl QueryParser { let default_occur = self.default_occur(); let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new(); for sub_query in sub_queries { - let (occur, sub_ast) = self.compute_logical_ast_with_occur(*sub_query)?; + let (occur, sub_ast) = self.compute_logical_ast_with_occur(sub_query)?; let new_occur = compose_occur(default_occur, occur); logical_sub_queries.push((new_occur, sub_ast)); } Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } - UserInputAST::Not(subquery) => { - let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; - Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries)) + UserInputAST::Unary(left_occur, subquery) => { + let (right_occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; + Ok((compose_occur(left_occur, right_occur), logical_sub_queries)) } - UserInputAST::Must(subquery) => { - let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; - Ok((compose_occur(Occur::Must, occur), logical_sub_queries)) - } - UserInputAST::Range { - field, - lower, - upper, - } => { - let fields = self.resolved_fields(&field)?; - let mut clauses = fields - .iter() - .map(|&field| { - let field_entry = self.schema.get_field_entry(field); - let value_type = field_entry.field_type().value_type(); - Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range { - field, 
- value_type, - lower: self.resolve_bound(field, &lower)?, - upper: self.resolve_bound(field, &upper)?, - }))) - }) - .collect::, QueryParserError>>()?; - let result_ast = if clauses.len() == 1 { - clauses.pop().unwrap() - } else { - LogicalAST::Clause( - clauses - .into_iter() - .map(|clause| (Occur::Should, clause)) - .collect(), - ) - }; + UserInputAST::Leaf(leaf) => { + let result_ast = self.compute_logical_ast_from_leaf(*leaf)?; Ok((Occur::Should, result_ast)) } - UserInputAST::All => Ok(( - Occur::Should, - LogicalAST::Leaf(Box::new(LogicalLiteral::All)), - )), - UserInputAST::Leaf(literal) => { + } + } + + + fn compute_logical_ast_from_leaf(&self, leaf: UserInputLeaf) -> Result { + match leaf { + UserInputLeaf::Literal(literal) => { let term_phrases: Vec<(Field, String)> = match literal.field_name { Some(ref field_name) => { let field = self.resolve_field_name(field_name)?; @@ -395,30 +377,40 @@ impl QueryParser { } else { LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) }; - Ok((Occur::Should, result_ast)) + Ok(result_ast) + } + UserInputLeaf::All => { + Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))) + } + UserInputLeaf::Range { field, lower, upper } => { + let fields = self.resolved_fields(&field)?; + let mut clauses = fields + .iter() + .map(|&field| { + let field_entry = self.schema.get_field_entry(field); + let value_type = field_entry.field_type().value_type(); + Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range { + field, + value_type, + lower: self.resolve_bound(field, &lower)?, + upper: self.resolve_bound(field, &upper)?, + }))) + }) + .collect::, QueryParserError>>()?; + let result_ast = if clauses.len() == 1 { + clauses.pop().unwrap() + } else { + LogicalAST::Clause( + clauses + .into_iter() + .map(|clause| (Occur::Should, clause)) + .collect(), + ) + }; + Ok(result_ast) } } - } -} -/// Compose two occur values. 
-fn compose_occur(left: Occur, right: Occur) -> Occur { - match left { - Occur::Should => right, - Occur::Must => { - if right == Occur::MustNot { - Occur::MustNot - } else { - Occur::Must - } - } - Occur::MustNot => { - if right == Occur::MustNot { - Occur::Must - } else { - Occur::MustNot - } - } } } diff --git a/src/query/query_parser/user_input_ast.rs b/src/query/query_parser/user_input_ast.rs index 96606915d..37adb94be 100644 --- a/src/query/query_parser/user_input_ast.rs +++ b/src/query/query_parser/user_input_ast.rs @@ -1,4 +1,41 @@ use std::fmt; +use std::fmt::{Debug, Formatter}; + +use query::Occur; + +pub enum UserInputLeaf { + Literal(UserInputLiteral), + All, + Range { + field: Option, + lower: UserInputBound, + upper: UserInputBound, + }, +} + +impl Debug for UserInputLeaf { + fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> { + match self { + UserInputLeaf::Literal(literal) => { + literal.fmt(formatter) + } + UserInputLeaf::Range { + ref field, + ref lower, + ref upper, + } => { + if let &Some(ref field) = field { + write!(formatter, "{}:", field)?; + } + lower.display_lower(formatter)?; + write!(formatter, " TO ")?; + upper.display_upper(formatter)?; + Ok(()) + } + UserInputLeaf::All => write!(formatter, "*"), + } + } +} pub struct UserInputLiteral { pub field_name: Option, @@ -43,28 +80,99 @@ impl UserInputBound { } pub enum UserInputAST { - Clause(Vec>), - Not(Box), - Must(Box), - Range { - field: Option, - lower: UserInputBound, - upper: UserInputBound, - }, - All, - Leaf(Box), + Clause(Vec), + Unary(Occur, Box), +// Not(Box), +// Should(Box), +// Must(Box), + Leaf(Box), } -impl From for UserInputAST { - fn from(literal: UserInputLiteral) -> UserInputAST { - UserInputAST::Leaf(Box::new(literal)) + +impl UserInputAST { + pub fn unary(self, occur: Occur) -> UserInputAST { + UserInputAST::Unary(occur, Box::new(self)) + } + + fn compose(occur: Occur, asts: Vec) -> UserInputAST { + assert!(occur != Occur::MustNot); + 
assert!(!asts.is_empty()); + if asts.len() == 1 { + asts.into_iter().next().unwrap() //< safe + } else { + UserInputAST::Clause(asts + .into_iter() + .map(|ast: UserInputAST| + ast.unary(occur) + ) + .collect::>() + ) + } + } + + pub fn and(asts: Vec) -> UserInputAST { + UserInputAST::compose(Occur::Must, asts) + } + + pub fn or(asts: Vec) -> UserInputAST { + UserInputAST::compose(Occur::Should, asts) + } + +} + + + +/* +impl UserInputAST { + + fn compose_occur(self, occur: Occur) -> UserInputAST { + match self { + UserInputAST::Not(other) => { + let new_occur = compose_occur(Occur::MustNot, occur); + other.simplify() + } + _ => { + self + } + } + } + + pub fn simplify(self) -> UserInputAST { + match self { + UserInputAST::Clause(els) => { + if els.len() == 1 { + return els.into_iter().next().unwrap(); + } else { + return self; + } + } + UserInputAST::Not(els) => { + if els.len() == 1 { + return els.into_iter().next().unwrap(); + } else { + return self; + } + } + } + } +} +*/ + +impl From for UserInputLeaf { + fn from(literal: UserInputLiteral) -> UserInputLeaf { + UserInputLeaf::Literal(literal) + } +} + +impl From for UserInputAST { + fn from(leaf: UserInputLeaf) -> UserInputAST { + UserInputAST::Leaf(Box::new(leaf)) } } impl fmt::Debug for UserInputAST { fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { - UserInputAST::Must(ref subquery) => write!(formatter, "+({:?})", subquery), UserInputAST::Clause(ref subqueries) => { if subqueries.is_empty() { write!(formatter, "")?; @@ -78,21 +186,9 @@ impl fmt::Debug for UserInputAST { } Ok(()) } - UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery), - UserInputAST::Range { - ref field, - ref lower, - ref upper, - } => { - if let &Some(ref field) = field { - write!(formatter, "{}:", field)?; - } - lower.display_lower(formatter)?; - write!(formatter, " TO ")?; - upper.display_upper(formatter)?; - Ok(()) + UserInputAST::Unary(ref occur, ref subquery) => { + 
write!(formatter, "{}({:?})", occur.to_char(), subquery) } - UserInputAST::All => write!(formatter, "*"), UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery), } }