Issue/246 (#393)

* Moving Range and All to Leaves

* Parsing OR/AND

* Simplify user input ast

* AND and OR supported. Returning an error when mixing syntax

Closes #246

* Added support for NOT

* Updated changelog
This commit is contained in:
Paul Masurel
2018-08-28 11:03:54 +09:00
committed by GitHub
parent ede97eded6
commit 2649c8a715
6 changed files with 355 additions and 112 deletions

View File

@@ -4,6 +4,7 @@ Tantivy 0.7
- Skip data for doc ids and positions (@fulmicoton),
greatly improving performance
- Tantivy error now rely on the failure crate (@drusellers)
- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
Tantivy 0.6.1

View File

@@ -32,8 +32,8 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as lucene)
- Basic query language (`+michael +jackson`)
- Phrase queries search (\"michael jackson\"`)
- Natural query language `(michael AND jackson) OR "king of pop"`
- Phrase queries search (`"michael jackson"`)
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory

View File

@@ -12,3 +12,38 @@ pub enum Occur {
/// search.
MustNot,
}
impl Occur {
/// Returns the one-char prefix symbol for this `Occur`.
/// - `Should` => '?',
/// - `Must` => '+'
/// - `Not` => '-'
pub fn to_char(&self) -> char {
match *self {
Occur::Should => '?',
Occur::Must => '+',
Occur::MustNot => '-',
}
}
}
/// Compose two occur values.
pub fn compose_occur(left: Occur, right: Occur) -> Occur {
match left {
Occur::Should => right,
Occur::Must => {
if right == Occur::MustNot {
Occur::MustNot
} else {
Occur::Must
}
}
Occur::MustNot => {
if right == Occur::MustNot {
Occur::Must
} else {
Occur::MustNot
}
}
}
}

View File

@@ -1,6 +1,9 @@
use super::user_input_ast::*;
use combine::char::*;
use combine::*;
use combine::stream::StreamErrorFor;
use combine::error::StreamError;
use query::occur::Occur;
use query::query_parser::user_input_ast::UserInputBound;
parser! {
@@ -17,18 +20,25 @@ parser! {
fn word[I]()(I) -> String
where [I: Stream<Item = char>] {
many1(satisfy(|c: char| c.is_alphanumeric()))
.and_then(|s: String| {
match s.as_str() {
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
"AND" => Err(StreamErrorFor::<I>::unexpected_static_message("AND")),
"NOT" => Err(StreamErrorFor::<I>::unexpected_static_message("NOT")),
_ => Ok(s)
}
})
}
}
parser! {
fn literal[I]()(I) -> UserInputAST
fn literal[I]()(I) -> UserInputLeaf
where [I: Stream<Item = char>]
{
let term_val = || {
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
phrase.or(word())
};
let term_val_with_field = negative_number().or(term_val());
let term_query =
(field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
@@ -41,7 +51,7 @@ parser! {
});
try(term_query)
.or(term_default_field)
.map(UserInputAST::from)
.map(UserInputLeaf::from)
}
}
@@ -55,7 +65,14 @@ parser! {
}
parser! {
fn range[I]()(I) -> UserInputAST
fn spaces1[I]()(I) -> ()
where [I: Stream<Item = char>] {
skip_many1(space())
}
}
parser! {
fn range[I]()(I) -> UserInputLeaf
where [I: Stream<Item = char>] {
let term_val = || {
word().or(negative_number()).or(char('*').map(|_| "*".to_string()))
@@ -77,7 +94,7 @@ parser! {
string("TO"),
spaces(),
upper_bound,
).map(|(field, lower, _, _, _, upper)| UserInputAST::Range {
).map(|(field, lower, _, _, _, upper)| UserInputLeaf::Range {
field,
lower,
upper
@@ -88,13 +105,53 @@ parser! {
parser! {
fn leaf[I]()(I) -> UserInputAST
where [I: Stream<Item = char>] {
(char('-'), leaf())
.map(|(_, expr)| UserInputAST::Not(Box::new(expr)))
.or((char('+'), leaf()).map(|(_, expr)| UserInputAST::Must(Box::new(expr))))
(char('-'), leaf()).map(|(_, expr)| expr.unary(Occur::MustNot) )
.or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) ))
.or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr))
.or(char('*').map(|_| UserInputAST::All))
.or(try(range()))
.or(literal())
.or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) ))
.or(try(
(string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot)))
)
.or(
try(
range()
.map(|leaf| UserInputAST::from(leaf))
)
)
.or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf))))
}
}
enum BinaryOperand {
Or, And
}
parser! {
fn binary_operand[I]()(I) -> BinaryOperand
where [I: Stream<Item = char>] {
(spaces1(),
(
string("AND").map(|_| BinaryOperand::And)
.or(string("OR").map(|_| BinaryOperand::Or))
),
spaces1()).map(|(_, op,_)| op)
}
}
enum Element {
SingleEl(UserInputAST),
NormalDisjunctive(Vec<Vec<UserInputAST>>)
}
impl Element {
pub fn into_dnf(self) -> Vec<Vec<UserInputAST>> {
match self {
Element::NormalDisjunctive(conjunctions) =>
conjunctions,
Element::SingleEl(el) =>
vec!(vec!(el)),
}
}
}
@@ -102,14 +159,56 @@ parser! {
pub fn parse_to_ast[I]()(I) -> UserInputAST
where [I: Stream<Item = char>]
{
sep_by(leaf(), spaces())
.map(|subqueries: Vec<UserInputAST>| {
if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
}
})
(
try(
chainl1(
leaf().map(Element::SingleEl),
binary_operand().map(|op: BinaryOperand|
move |left: Element, right: Element| {
let mut dnf = left.into_dnf();
if let Element::SingleEl(el) = right {
match op {
BinaryOperand::And => {
if let Some(last) = dnf.last_mut() {
last.push(el);
}
}
BinaryOperand::Or => {
dnf.push(vec!(el));
}
}
} else {
unreachable!("Please report.")
}
Element::NormalDisjunctive(dnf)
}
)
)
.map(|el| el.into_dnf())
.map(|fnd| {
if fnd.len() == 1 {
UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe
} else {
let conjunctions = fnd
.into_iter()
.map(|conjunction| UserInputAST::and(conjunction))
.collect();
UserInputAST::or(conjunctions)
}
})
)
.or(
sep_by(leaf(), spaces())
.map(|subqueries: Vec<UserInputAST>| {
if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().collect())
}
})
)
)
}
}
@@ -128,6 +227,26 @@ mod test {
assert!(parse_to_ast().parse(query).is_err());
}
#[test]
fn test_parse_query_to_ast_not_op() {
assert_eq!(format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)");
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
test_parse_query_to_ast_helper("NOT a", "-(\"a\")");
}
#[test]
fn test_parse_query_to_ast_binary_op() {
test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))");
test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))");
test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))");
test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))");
assert_eq!(format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)");
assert_eq!(format!("{:?}", parse_to_ast().parse("a AND b aaa")), "Err(UnexpectedParse)");
assert_eq!(format!("{:?}", parse_to_ast().parse("aaa a OR b ")), "Err(UnexpectedParse)");
assert_eq!(format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), "Err(UnexpectedParse)");
}
#[test]
fn test_parse_query_to_ast() {
test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))");

View File

@@ -5,6 +5,7 @@ use core::Index;
use query::AllQuery;
use query::BooleanQuery;
use query::Occur;
use query::occur::compose_occur;
use query::PhraseQuery;
use query::Query;
use query::RangeQuery;
@@ -79,12 +80,22 @@ impl From<ParseIntError> for QueryParserError {
///
/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`.
///
///
/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is interpreted
/// as `(a AND b) OR c`.
///
/// * In addition to the boolean operators, the `-`, `+` can help define. These operators
/// are sufficient to axpress all queries using boolean operators. For instance `x AND y OR z` can
/// be written (`(+x +y) z`). In addition, these operators can help define "required optional"
/// queries. `(+x y)` matches the same document set as simply `x`, but `y` will help refining the score.
///
/// * negative terms: By prepending a term by a `-`, a term can be excluded
/// from the search. This is useful for disambiguating a query.
/// e.g. `apple -fruit`
///
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
///
///
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed.
/// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed
/// by "obama".
@@ -315,56 +326,27 @@ impl QueryParser {
let default_occur = self.default_occur();
let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new();
for sub_query in sub_queries {
let (occur, sub_ast) = self.compute_logical_ast_with_occur(*sub_query)?;
let (occur, sub_ast) = self.compute_logical_ast_with_occur(sub_query)?;
let new_occur = compose_occur(default_occur, occur);
logical_sub_queries.push((new_occur, sub_ast));
}
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Not(subquery) => {
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
UserInputAST::Unary(left_occur, subquery) => {
let (right_occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(left_occur, right_occur), logical_sub_queries))
}
UserInputAST::Must(subquery) => {
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
}
UserInputAST::Range {
field,
lower,
upper,
} => {
let fields = self.resolved_fields(&field)?;
let mut clauses = fields
.iter()
.map(|&field| {
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
field,
value_type,
lower: self.resolve_bound(field, &lower)?,
upper: self.resolve_bound(field, &upper)?,
})))
})
.collect::<Result<Vec<_>, QueryParserError>>()?;
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
LogicalAST::Clause(
clauses
.into_iter()
.map(|clause| (Occur::Should, clause))
.collect(),
)
};
UserInputAST::Leaf(leaf) => {
let result_ast = self.compute_logical_ast_from_leaf(*leaf)?;
Ok((Occur::Should, result_ast))
}
UserInputAST::All => Ok((
Occur::Should,
LogicalAST::Leaf(Box::new(LogicalLiteral::All)),
)),
UserInputAST::Leaf(literal) => {
}
}
fn compute_logical_ast_from_leaf(&self, leaf: UserInputLeaf) -> Result<LogicalAST, QueryParserError> {
match leaf {
UserInputLeaf::Literal(literal) => {
let term_phrases: Vec<(Field, String)> = match literal.field_name {
Some(ref field_name) => {
let field = self.resolve_field_name(field_name)?;
@@ -395,30 +377,40 @@ impl QueryParser {
} else {
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
Ok((Occur::Should, result_ast))
Ok(result_ast)
}
UserInputLeaf::All => {
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All)))
}
UserInputLeaf::Range { field, lower, upper } => {
let fields = self.resolved_fields(&field)?;
let mut clauses = fields
.iter()
.map(|&field| {
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
field,
value_type,
lower: self.resolve_bound(field, &lower)?,
upper: self.resolve_bound(field, &upper)?,
})))
})
.collect::<Result<Vec<_>, QueryParserError>>()?;
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
LogicalAST::Clause(
clauses
.into_iter()
.map(|clause| (Occur::Should, clause))
.collect(),
)
};
Ok(result_ast)
}
}
}
}
/// Compose two occur values.
fn compose_occur(left: Occur, right: Occur) -> Occur {
match left {
Occur::Should => right,
Occur::Must => {
if right == Occur::MustNot {
Occur::MustNot
} else {
Occur::Must
}
}
Occur::MustNot => {
if right == Occur::MustNot {
Occur::Must
} else {
Occur::MustNot
}
}
}
}

View File

@@ -1,4 +1,41 @@
use std::fmt;
use std::fmt::{Debug, Formatter};
use query::Occur;
pub enum UserInputLeaf {
Literal(UserInputLiteral),
All,
Range {
field: Option<String>,
lower: UserInputBound,
upper: UserInputBound,
},
}
impl Debug for UserInputLeaf {
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
match self {
UserInputLeaf::Literal(literal) => {
literal.fmt(formatter)
}
UserInputLeaf::Range {
ref field,
ref lower,
ref upper,
} => {
if let &Some(ref field) = field {
write!(formatter, "{}:", field)?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
upper.display_upper(formatter)?;
Ok(())
}
UserInputLeaf::All => write!(formatter, "*"),
}
}
}
pub struct UserInputLiteral {
pub field_name: Option<String>,
@@ -43,28 +80,99 @@ impl UserInputBound {
}
pub enum UserInputAST {
Clause(Vec<Box<UserInputAST>>),
Not(Box<UserInputAST>),
Must(Box<UserInputAST>),
Range {
field: Option<String>,
lower: UserInputBound,
upper: UserInputBound,
},
All,
Leaf(Box<UserInputLiteral>),
Clause(Vec<UserInputAST>),
Unary(Occur, Box<UserInputAST>),
// Not(Box<UserInputAST>),
// Should(Box<UserInputAST>),
// Must(Box<UserInputAST>),
Leaf(Box<UserInputLeaf>),
}
impl From<UserInputLiteral> for UserInputAST {
fn from(literal: UserInputLiteral) -> UserInputAST {
UserInputAST::Leaf(Box::new(literal))
impl UserInputAST {
pub fn unary(self, occur: Occur) -> UserInputAST {
UserInputAST::Unary(occur, Box::new(self))
}
fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
assert!(occur != Occur::MustNot);
assert!(!asts.is_empty());
if asts.len() == 1 {
asts.into_iter().next().unwrap() //< safe
} else {
UserInputAST::Clause(asts
.into_iter()
.map(|ast: UserInputAST|
ast.unary(occur)
)
.collect::<Vec<_>>()
)
}
}
pub fn and(asts: Vec<UserInputAST>) -> UserInputAST {
UserInputAST::compose(Occur::Must, asts)
}
pub fn or(asts: Vec<UserInputAST>) -> UserInputAST {
UserInputAST::compose(Occur::Should, asts)
}
}
/*
impl UserInputAST {
fn compose_occur(self, occur: Occur) -> UserInputAST {
match self {
UserInputAST::Not(other) => {
let new_occur = compose_occur(Occur::MustNot, occur);
other.simplify()
}
_ => {
self
}
}
}
pub fn simplify(self) -> UserInputAST {
match self {
UserInputAST::Clause(els) => {
if els.len() == 1 {
return els.into_iter().next().unwrap();
} else {
return self;
}
}
UserInputAST::Not(els) => {
if els.len() == 1 {
return els.into_iter().next().unwrap();
} else {
return self;
}
}
}
}
}
*/
impl From<UserInputLiteral> for UserInputLeaf {
fn from(literal: UserInputLiteral) -> UserInputLeaf {
UserInputLeaf::Literal(literal)
}
}
impl From<UserInputLeaf> for UserInputAST {
fn from(leaf: UserInputLeaf) -> UserInputAST {
UserInputAST::Leaf(Box::new(leaf))
}
}
impl fmt::Debug for UserInputAST {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
UserInputAST::Must(ref subquery) => write!(formatter, "+({:?})", subquery),
UserInputAST::Clause(ref subqueries) => {
if subqueries.is_empty() {
write!(formatter, "<emptyclause>")?;
@@ -78,21 +186,9 @@ impl fmt::Debug for UserInputAST {
}
Ok(())
}
UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery),
UserInputAST::Range {
ref field,
ref lower,
ref upper,
} => {
if let &Some(ref field) = field {
write!(formatter, "{}:", field)?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
upper.display_upper(formatter)?;
Ok(())
UserInputAST::Unary(ref occur, ref subquery) => {
write!(formatter, "{}({:?})", occur.to_char(), subquery)
}
UserInputAST::All => write!(formatter, "*"),
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
}
}