mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-01 08:00:41 +00:00
issue/57 Switch to the new query parser.
This commit is contained in:
66
src/query/query_parser/logical_ast.rs
Normal file
66
src/query/query_parser/logical_ast.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use std::fmt;
|
||||
use schema::Term;
|
||||
use query::Occur;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum LogicalLiteral {
|
||||
Term(Term),
|
||||
Phrase(Vec<Term>),
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum LogicalAST{
|
||||
Clause(Vec<(Occur, LogicalAST)>),
|
||||
Leaf(Box<LogicalLiteral>)
|
||||
}
|
||||
|
||||
fn occur_letter(occur: Occur) -> &'static str {
|
||||
match occur {
|
||||
Occur::Must => "+",
|
||||
Occur::MustNot => "-",
|
||||
Occur::Should => "",
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for LogicalAST {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
LogicalAST::Clause(ref clause) => {
|
||||
if clause.is_empty() {
|
||||
try!(write!(formatter, "<emptyclause>"));
|
||||
}
|
||||
else {
|
||||
let (ref occur, ref subquery) = clause[0];
|
||||
try!(write!(formatter, "({}{:?}", occur_letter(*occur), subquery));
|
||||
for &(ref occur, ref subquery) in &clause[1..] {
|
||||
try!(write!(formatter, " {}{:?}", occur_letter(*occur), subquery));
|
||||
}
|
||||
try!(formatter.write_str(")"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LogicalAST::Leaf(ref literal) => {
|
||||
write!(formatter, "{:?}", literal)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LogicalLiteral> for LogicalAST {
|
||||
fn from(literal: LogicalLiteral) -> LogicalAST {
|
||||
LogicalAST::Leaf(box literal)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for LogicalLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
LogicalLiteral::Term(ref term) => {
|
||||
write!(formatter, "{:?}", term)
|
||||
},
|
||||
LogicalLiteral::Phrase(ref terms) => {
|
||||
write!(formatter, "\"{:?}\"", terms)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
7
src/query/query_parser/mod.rs
Normal file
7
src/query/query_parser/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
mod query_parser;
|
||||
mod query_grammar;
|
||||
mod user_input_ast;
|
||||
mod logical_ast;
|
||||
|
||||
pub use self::query_parser::QueryParser;
|
||||
pub use self::query_parser::QueryParserError;
|
||||
80
src/query/query_parser/query_grammar.rs
Normal file
80
src/query/query_parser/query_grammar.rs
Normal file
@@ -0,0 +1,80 @@
|
||||
use combine::*;
|
||||
use combine::char::*;
|
||||
use super::user_input_ast::*;
|
||||
|
||||
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char> {
|
||||
let term_val = || {
|
||||
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
|
||||
let phrase =
|
||||
(char('"'), many1(satisfy(|c| c != '"')), char('"'),)
|
||||
.map(|(_, s, _)| s);
|
||||
phrase.or(word)
|
||||
};
|
||||
let field = many1(letter());
|
||||
let term_query = (field, char(':'), term_val())
|
||||
.map(|(field_name,_, phrase)| {
|
||||
UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase: phrase
|
||||
}
|
||||
});
|
||||
let term_default_field = term_val()
|
||||
.map(|phrase| {
|
||||
UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase: phrase
|
||||
}
|
||||
});
|
||||
try(term_query).or(term_default_field)
|
||||
.map(|query_literal| UserInputAST::from(query_literal))
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
|
||||
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char> {
|
||||
(char('-'), parser(literal)).map(|(_, expr)| UserInputAST::Not(box expr))
|
||||
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
|
||||
.or(parser(literal))
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
|
||||
pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char>
|
||||
{
|
||||
sep_by(parser(leaf), spaces())
|
||||
.map(|subqueries: Vec<UserInputAST>| {
|
||||
if subqueries.len() == 1 {
|
||||
subqueries.into_iter().next().unwrap()
|
||||
}
|
||||
else {
|
||||
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
|
||||
}
|
||||
})
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod test {

    use super::*;

    /// Parses `query` and compares the `Debug` rendering of the AST with `expected`.
    fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
        let (ast, _) = parse_to_ast(query).unwrap();
        assert_eq!(format!("{:?}", ast), expected);
    }

    #[test]
    pub fn test_parse_query_to_ast() {
        // (query, expected debug output)
        let cases = [
            ("abc:toto", "abc:\"toto\""),
            ("+abc:toto", "+(abc:\"toto\")"),
            ("+abc:toto -titi", "+(abc:\"toto\") -(\"titi\")"),
            ("-abc:toto", "-(abc:\"toto\")"),
            ("abc:a b", "abc:\"a\" \"b\""),
            ("abc:\"a b\"", "abc:\"a b\""),
        ];
        for &(query, expected) in cases.iter() {
            test_parse_query_to_ast_helper(query, expected);
        }
    }
}
|
||||
301
src/query/query_parser/query_parser.rs
Normal file
301
src/query/query_parser/query_parser.rs
Normal file
@@ -0,0 +1,301 @@
|
||||
use schema::{Schema, Field};
|
||||
use query::Query;
|
||||
use query::BooleanQuery;
|
||||
use super::logical_ast::*;
|
||||
use super::user_input_ast::*;
|
||||
use super::query_grammar::parse_to_ast;
|
||||
use query::Occur;
|
||||
use query::TermQuery;
|
||||
use query::PhraseQuery;
|
||||
use analyzer::SimpleTokenizer;
|
||||
use analyzer::StreamingIterator;
|
||||
use schema::Term;
|
||||
|
||||
|
||||
|
||||
/// Errors that can be returned when parsing a query.
#[derive(Debug, PartialEq, Eq)]
pub enum QueryParserError {
    /// The query does not respect the query syntax.
    SyntaxError,
    /// `FieldDoesNotExist(field_name: String)`
    /// The query references a field that is not in the schema.
    FieldDoesNotExist(String),
    /// `ExpectedU32(field_name: String, field_value: String)`
    /// The query contains a term for a `u32`-field, but the value
    /// is not a u32.
    ExpectedU32(String, String),
    /// The query as a whole is a negation (e.g. `-title:toto`),
    /// which is not allowed.
    AllButQueryForbidden,
    /// The query contains a literal without a field name, but the
    /// query parser has no default field to search it in.
    NoDefaultFieldDeclared,
}
|
||||
|
||||
|
||||
|
||||
/// Tantivy's Query parser
|
||||
///
|
||||
/// The language covered by the current parser is extremely simple.
|
||||
///
|
||||
/// * simple terms: "e.g.: `Barack Obama` are simply analyzed using
|
||||
/// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.
|
||||
/// The terms are then searched within the default terms of the query parser.
|
||||
///
|
||||
/// e.g. If `body` and `title` are default fields, our example terms are
|
||||
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
|
||||
/// By default, all tokenized and indexed fields are default fields.
|
||||
///
|
||||
/// Multiple terms are handled as an `OR` : any document containing at least
|
||||
/// one of the term will go through the scoring.
|
||||
///
|
||||
/// This behavior is slower, but is not a bad idea if the user is sorting
|
||||
/// by relevance : The user typically just scans through the first few
|
||||
/// documents in order of decreasing relevance and will stop when the documents
|
||||
/// are not relevant anymore.
|
||||
/// Making it possible to make this behavior customizable is tracked in
|
||||
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
|
||||
///
|
||||
/// * negative terms: By prepending a term by a `-`, a term can be excluded
|
||||
/// from the search. This is useful for disambiguating a query.
|
||||
/// e.g. `apple -fruit`
|
||||
///
|
||||
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
|
||||
///
|
||||
pub struct QueryParser {
|
||||
schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
conjunction_by_default: bool,
|
||||
analyzer: Box<SimpleTokenizer>,
|
||||
}
|
||||
|
||||
impl QueryParser {
|
||||
/// Creates a `QueryParser`
|
||||
/// * schema - index Schema
|
||||
/// * default_fields - fields used to search if no field is specifically defined
|
||||
/// in the query.
|
||||
pub fn new(schema: Schema,
|
||||
default_fields: Vec<Field>) -> QueryParser {
|
||||
QueryParser {
|
||||
schema: schema,
|
||||
default_fields: default_fields,
|
||||
conjunction_by_default: false,
|
||||
analyzer: box SimpleTokenizer,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_conjunction_by_default(&mut self) {
|
||||
self.conjunction_by_default = true;
|
||||
}
|
||||
|
||||
/// Parse a query
|
||||
///
|
||||
/// Note that `parse_query` returns an error if the input
|
||||
/// is not a valid query.
|
||||
///
|
||||
/// There is currently no lenient mode for the query parser
|
||||
/// which makes it a bad choice for a public/broad user search engine.
|
||||
///
|
||||
/// Implementing a lenient mode for this query parser is tracked
|
||||
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
|
||||
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
|
||||
let logical_ast = self.parse_query_to_logical_ast(query)?;
|
||||
Ok(convert_to_query(logical_ast))
|
||||
}
|
||||
|
||||
pub fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
||||
let (user_input_ast, remaining) = parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
|
||||
self.schema.get_field(field_name)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
|
||||
}
|
||||
|
||||
pub fn compute_logical_ast(&self, user_input_ast: UserInputAST) -> Result<LogicalAST, QueryParserError> {
|
||||
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
|
||||
if occur == Occur::MustNot {
|
||||
return Err(QueryParserError::AllButQueryForbidden)
|
||||
}
|
||||
Ok(ast)
|
||||
}
|
||||
|
||||
fn compute_logical_ast_for_leaf(&self, field: Field, phrase: &str) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
let mut token_iter = self.analyzer.tokenize(phrase);
|
||||
let mut tokens: Vec<Term> = Vec::new();
|
||||
loop {
|
||||
if let Some(token) = token_iter.next() {
|
||||
let text = token.to_string();
|
||||
// TODO Handle u32
|
||||
let term = Term::from_field_text(field, &text);
|
||||
tokens.push(term);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if tokens.is_empty() {
|
||||
Ok(None)
|
||||
}
|
||||
else if tokens.len() == 1 {
|
||||
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
|
||||
}
|
||||
else {
|
||||
Ok(Some(LogicalLiteral::Phrase(tokens)))
|
||||
}
|
||||
}
|
||||
|
||||
fn default_occur(&self) -> Occur {
|
||||
if self.conjunction_by_default {
|
||||
Occur::Must
|
||||
}
|
||||
else {
|
||||
Occur::Should
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn compute_logical_ast_with_occur(&self, user_input_ast: UserInputAST) -> Result<(Occur, LogicalAST), QueryParserError> {
|
||||
match user_input_ast {
|
||||
UserInputAST::Clause(sub_queries) => {
|
||||
let default_occur = self.default_occur();
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries
|
||||
.into_iter()
|
||||
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
||||
.map(|res|
|
||||
res.map(
|
||||
|(occur, sub_ast)| (default_occur.compose(occur), sub_ast)
|
||||
)
|
||||
)
|
||||
.collect());
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
}
|
||||
UserInputAST::Not(subquery) => {
|
||||
let (occur, logical_sub_queries) = try!(self.compute_logical_ast_with_occur(*subquery));
|
||||
Ok((Occur::MustNot.compose(occur), logical_sub_queries))
|
||||
},
|
||||
UserInputAST::Must(subquery) => {
|
||||
let (occur, logical_sub_queries) = try!(self.compute_logical_ast_with_occur(*subquery));
|
||||
Ok((Occur::Must.compose(occur), logical_sub_queries))
|
||||
},
|
||||
UserInputAST::Leaf(literal) => {
|
||||
let term_phrases: Vec<(Field, String)> = match literal.field_name {
|
||||
Some(ref field_name) => {
|
||||
let field = try!(self.resolve_field_name(&field_name));
|
||||
vec!((field, literal.phrase.clone()))
|
||||
}
|
||||
None => {
|
||||
if self.default_fields.len() == 0 {
|
||||
return Err(QueryParserError::NoDefaultFieldDeclared)
|
||||
}
|
||||
else if self.default_fields.len() == 1 {
|
||||
vec!((self.default_fields[0], literal.phrase.clone()))
|
||||
}
|
||||
else {
|
||||
self.default_fields
|
||||
.iter()
|
||||
.map(|default_field| (*default_field, literal.phrase.clone()))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut asts: Vec<LogicalAST> = Vec::new();
|
||||
for (field, phrase) in term_phrases {
|
||||
if let Some(ast) = try!(self.compute_logical_ast_for_leaf(field, &phrase)) {
|
||||
asts.push(LogicalAST::Leaf(box ast));
|
||||
}
|
||||
}
|
||||
let result_ast =
|
||||
if asts.len() == 0 {
|
||||
panic!("not working");
|
||||
}
|
||||
else if asts.len() == 1 {
|
||||
asts[0].clone()
|
||||
}
|
||||
else {
|
||||
LogicalAST::Clause(asts
|
||||
.into_iter()
|
||||
.map(|ast| (Occur::Should, ast))
|
||||
.collect())
|
||||
};
|
||||
Ok((Occur::Should, result_ast))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
|
||||
match logical_literal {
|
||||
LogicalLiteral::Term(term) => {
|
||||
box TermQuery::from(term)
|
||||
}
|
||||
LogicalLiteral::Phrase(terms) => {
|
||||
box PhraseQuery::from(terms)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
||||
match logical_ast {
|
||||
LogicalAST::Clause(clause) => {
|
||||
let occur_subqueries = clause.into_iter()
|
||||
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
||||
.collect::<Vec<_>>();
|
||||
box BooleanQuery::from(occur_subqueries)
|
||||
}
|
||||
LogicalAST::Leaf(logical_literal) => {
|
||||
convert_literal_to_query(*logical_literal)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod test {
    use schema::{SchemaBuilder, TEXT};
    use super::QueryParser;
    use super::QueryParserError;
    use super::super::logical_ast::*;

    /// Parses `query` against a schema made of two text fields, `title`
    /// and `text`, both declared as default fields.
    fn parse_query_to_logical_ast(query: &str, default_conjunction: bool) -> Result<LogicalAST, QueryParserError> {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", TEXT);
        let text = schema_builder.add_text_field("text", TEXT);
        let mut query_parser = QueryParser::new(schema_builder.build(), vec![title, text]);
        if default_conjunction {
            query_parser.set_conjunction_by_default();
        }
        query_parser.parse_query_to_logical_ast(query)
    }

    /// Checks the `Debug` rendering of the logical AST computed for `query`.
    fn test_parse_query_to_logical_ast_helper(query: &str, expected: &str, default_conjunction: bool) {
        let logical_ast = parse_query_to_logical_ast(query, default_conjunction).unwrap();
        assert_eq!(format!("{:?}", logical_ast), expected);
    }

    #[test]
    pub fn test_parse_query_to_ast_disjunction() {
        let conjunction = false;
        test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", conjunction);
        test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", conjunction);
        test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", conjunction);
        let all_but_error = parse_query_to_logical_ast("-title:toto", conjunction).err().unwrap();
        assert_eq!(all_but_error, QueryParserError::AllButQueryForbidden);
        test_parse_query_to_logical_ast_helper("title:a b", "(Term([0, 97]) (Term([0, 98]) Term([1, 98])))", conjunction);
        test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", conjunction);
    }

    #[test]
    pub fn test_parse_query_to_ast_conjunction() {
        let conjunction = true;
        test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", conjunction);
        test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", conjunction);
        test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", conjunction);
        let all_but_error = parse_query_to_logical_ast("-title:toto", conjunction).err().unwrap();
        assert_eq!(all_but_error, QueryParserError::AllButQueryForbidden);
        test_parse_query_to_logical_ast_helper("title:a b", "(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))", conjunction);
        test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", conjunction);
    }
}
|
||||
62
src/query/query_parser/user_input_ast.rs
Normal file
62
src/query/query_parser/user_input_ast.rs
Normal file
@@ -0,0 +1,62 @@
|
||||
use std::fmt;
|
||||
|
||||
/// A literal exactly as typed by the user.
pub struct UserInputLiteral {
    /// Name of the field in front of the `:`, if the user gave one.
    pub field_name: Option<String>,
    /// Text of the literal (a word, or the content of a quoted phrase).
    pub phrase: String,
}
|
||||
|
||||
impl fmt::Debug for UserInputLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match self.field_name {
|
||||
Some(ref field_name) => {
|
||||
write!(formatter, "{}:\"{}\"", field_name, self.phrase)
|
||||
}
|
||||
None => {
|
||||
write!(formatter, "\"{}\"", self.phrase)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum UserInputAST {
|
||||
Clause(Vec<Box<UserInputAST>>),
|
||||
Not(Box<UserInputAST>),
|
||||
Must(Box<UserInputAST>),
|
||||
Leaf(Box<UserInputLiteral>)
|
||||
|
||||
}
|
||||
|
||||
impl From<UserInputLiteral> for UserInputAST {
|
||||
fn from(literal: UserInputLiteral) -> UserInputAST {
|
||||
UserInputAST::Leaf(box literal)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for UserInputAST {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
UserInputAST::Must(ref subquery) => {
|
||||
write!(formatter, "+({:?})", subquery)
|
||||
},
|
||||
UserInputAST::Clause(ref subqueries) => {
|
||||
if subqueries.is_empty() {
|
||||
try!(write!(formatter, "<emptyclause>"));
|
||||
}
|
||||
else {
|
||||
try!(write!(formatter, "{:?}", &subqueries[0]));
|
||||
for subquery in &subqueries[1..] {
|
||||
try!(write!(formatter, " {:?}", subquery));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
},
|
||||
UserInputAST::Not(ref subquery) => {
|
||||
write!(formatter, "-({:?})", subquery)
|
||||
}
|
||||
UserInputAST::Leaf(ref subquery) => {
|
||||
write!(formatter, "{:?}", subquery)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user