issue/57 Cleaning.

Closes #57
Closes #56
Closes #23
This commit is contained in:
Paul Masurel
2016-11-17 23:18:24 +09:00
parent 3f20788a36
commit 69e11d3779
15 changed files with 259 additions and 345 deletions

View File

@@ -1,7 +1,7 @@
mod query_parser;
mod query_grammar;
mod user_input_ast;
mod logical_ast;
pub mod logical_ast;
pub use self::query_parser::QueryParser;
pub use self::query_parser::QueryParserError;

View File

@@ -2,79 +2,83 @@ use combine::*;
use combine::char::*;
use super::user_input_ast::*;
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char> {
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
{
let term_val = || {
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
let phrase =
(char('"'), many1(satisfy(|c| c != '"')), char('"'),)
.map(|(_, s, _)| s);
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
phrase.or(word)
};
let field = many1(letter());
let term_query = (field, char(':'), term_val())
.map(|(field_name,_, phrase)| {
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase
}
});
let term_default_field = term_val()
.map(|phrase| {
UserInputLiteral {
field_name: None,
phrase: phrase
}
});
try(term_query).or(term_default_field)
.map(|query_literal| UserInputAST::from(query_literal))
.parse_stream(input)
}
let term_query = (field, char(':'), term_val()).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,
}
});
let term_default_field = term_val().map(|phrase| {
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
try(term_query)
.or(term_default_field)
.map(|query_literal| UserInputAST::from(query_literal))
.parse_stream(input)
}
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char> {
(char('-'), parser(literal)).map(|(_, expr)| UserInputAST::Not(box expr))
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
.or(parser(literal))
.parse_stream(input)
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
{
(char('-'), parser(literal))
.map(|(_, expr)| UserInputAST::Not(box expr))
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
.or(parser(literal))
.parse_stream(input)
}
pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
{
{
sep_by(parser(leaf), spaces())
.map(|subqueries: Vec<UserInputAST>| {
if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
}
else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
}
})
.parse_stream(input)
.map(|subqueries: Vec<UserInputAST>| {
if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
}
})
.parse_stream(input)
}
#[cfg(test)]
mod test {
use super::*;
/// Parses `query` with `parse_to_ast` and checks that the `Debug` rendering
/// of the resulting AST equals `expected`.
///
/// Panics if the query fails to parse or if the rendering differs.
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
    // Only the AST matters here; the second tuple element is ignored.
    let (ast, _rest) = parse_to_ast(query).unwrap();
    assert_eq!(format!("{:?}", ast), expected);
}
/// Asserts that `parse_to_ast` rejects `query`, i.e. returns an `Err`.
fn test_is_parse_err(query: &str) {
assert!(parse_to_ast(query).is_err());
}
#[test]
pub fn test_parse_query_to_ast() {
fn test_parse_query_to_ast() {
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
test_parse_query_to_ast_helper("+abc:toto", "+(abc:\"toto\")");
test_parse_query_to_ast_helper("+abc:toto -titi", "+(abc:\"toto\") -(\"titi\")");
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
test_parse_query_to_ast_helper("abc:a b", "abc:\"a\" \"b\"");
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
test_is_parse_err("abc + ");
}
}

View File

@@ -6,6 +6,7 @@ use super::user_input_ast::*;
use super::query_grammar::parse_to_ast;
use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
@@ -25,9 +26,10 @@ pub enum QueryParserError {
/// The query contains a term for a `u32`-field, but the value
/// is not a u32.
ExpectedU32(String, String),
/// Queries that only exclude terms (e.g. `-title:pop`) are forbidden.
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbidden.
NoDefaultFieldDeclared,
}
@@ -37,14 +39,14 @@ pub enum QueryParserError {
///
/// The language covered by the current parser is extremely simple.
///
/// * simple terms: "e.g.: `Barack Obama` are simply analyzed using
/// * simple terms: "e.g.: `Barack Obama` are simply analyzed using
/// tantivy's `SimpleTokenizer`, hence becoming `["barack", "obama"]`.
/// The terms are then searched within the default terms of the query parser.
///
///
/// e.g. If `body` and `title` are default fields, our example terms are
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
/// By default, all tokenized and indexed fields are default fields.
///
///
/// Multiple terms are handled as an `OR`: any document containing at least
/// one of the terms will go through the scoring.
///
@@ -54,13 +56,13 @@ pub enum QueryParserError {
/// are not relevant anymore.
/// Making it possible to make this behavior customizable is tracked in
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
///
///
/// * negative terms: By prepending a term by a `-`, a term can be excluded
/// from the search. This is useful for disambiguating a query.
/// e.g. `apple -fruit`
/// e.g. `apple -fruit`
///
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
///
///
pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
@@ -73,16 +75,18 @@ impl QueryParser {
/// * schema - index Schema
/// * default_fields - fields used to search if no field is specifically defined
/// in the query.
pub fn new(schema: Schema,
default_fields: Vec<Field>) -> QueryParser {
pub fn new(schema: Schema, default_fields: Vec<Field>) -> QueryParser {
QueryParser {
schema: schema,
default_fields: default_fields,
conjunction_by_default: false,
analyzer: box SimpleTokenizer,
}
}
}
/// Set the default way to compose queries to a conjunction.
///
/// By default, a freshly created parser has `conjunction_by_default` set to
/// `false`, so terms without an explicit modifier are composed with
/// `Occur::Should` (a disjunction). After calling this method they are
/// composed with `Occur::Must` instead.
pub fn set_conjunction_by_default(&mut self) {
self.conjunction_by_default = true;
}
@@ -91,36 +95,44 @@ impl QueryParser {
///
/// Note that `parse_query` returns an error if the input
/// is not a valid query.
///
///
/// There is currently no lenient mode for the query parser
/// which makes it a bad choice for a public/broad user search engine.
///
/// Implementing a lenient mode for this query parser is tracked
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(self.convert_to_query(logical_ast))
Ok(convert_to_query(logical_ast))
}
pub fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, remaining) = parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) =
parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
self.schema
.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
}
pub fn compute_logical_ast(&self, user_input_ast: UserInputAST) -> Result<LogicalAST, QueryParserError> {
fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden)
return Err(QueryParserError::AllButQueryForbidden);
}
Ok(ast)
Ok(ast)
}
fn compute_logical_ast_for_leaf(&self, field: Field, phrase: &str) -> Result<Option<LogicalLiteral>, QueryParserError> {
fn compute_logical_ast_for_leaf(&self,
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
let mut token_iter = self.analyzer.tokenize(phrase);
let mut tokens: Vec<Term> = Vec::new();
loop {
@@ -129,98 +141,63 @@ impl QueryParser {
// TODO Handle u32
let term = Term::from_field_text(field, &text);
tokens.push(term);
}
else {
} else {
break;
}
}
if tokens.is_empty() {
Ok(None)
}
else if tokens.len() == 1 {
} else if tokens.len() == 1 {
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
}
else {
} else {
Ok(Some(LogicalLiteral::Phrase(tokens)))
}
}
fn default_occur(&self) -> Occur {
if self.conjunction_by_default {
Occur::Must
}
else {
} else {
Occur::Should
}
}
fn convert_literal_to_query(&self, logical_literal: LogicalLiteral) -> Box<Query> {
match logical_literal {
LogicalLiteral::Term(term) => {
let field = term.field();
TODO check the schema to get the correct segment otpins
box TermQuery::from(term)
}
LogicalLiteral::Phrase(terms) => {
TODO check the schema to get the correct segment otpins
box PhraseQuery::from(terms)
}
}
}
fn convert_to_query(&self, logical_ast: LogicalAST) -> Box<Query> {
match logical_ast {
LogicalAST::Clause(clause) => {
let occur_subqueries = clause.into_iter()
.map(|(occur, subquery)| (occur, self.convert_to_query(subquery)))
.collect::<Vec<_>>();
box BooleanQuery::from(occur_subqueries)
}
LogicalAST::Leaf(logical_literal) => {
self.convert_literal_to_query(*logical_literal)
}
}
}
pub fn compute_logical_ast_with_occur(&self, user_input_ast: UserInputAST) -> Result<(Occur, LogicalAST), QueryParserError> {
fn compute_logical_ast_with_occur(&self,
user_input_ast: UserInputAST)
-> Result<(Occur, LogicalAST), QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
let default_occur = self.default_occur();
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries
.into_iter()
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res|
res.map(
|(occur, sub_ast)| (default_occur.compose(occur), sub_ast)
)
)
.map(|res| {
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
})
.collect());
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Not(subquery) => {
let (occur, logical_sub_queries) = try!(self.compute_logical_ast_with_occur(*subquery));
Ok((Occur::MustNot.compose(occur), logical_sub_queries))
},
let (occur, logical_sub_queries) =
try!(self.compute_logical_ast_with_occur(*subquery));
Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
}
UserInputAST::Must(subquery) => {
let (occur, logical_sub_queries) = try!(self.compute_logical_ast_with_occur(*subquery));
Ok((Occur::Must.compose(occur), logical_sub_queries))
},
let (occur, logical_sub_queries) =
try!(self.compute_logical_ast_with_occur(*subquery));
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
}
UserInputAST::Leaf(literal) => {
let term_phrases: Vec<(Field, String)> = match literal.field_name {
Some(ref field_name) => {
let field = try!(self.resolve_field_name(&field_name));
vec!((field, literal.phrase.clone()))
vec![(field, literal.phrase.clone())]
}
None => {
if self.default_fields.len() == 0 {
return Err(QueryParserError::NoDefaultFieldDeclared)
}
else if self.default_fields.len() == 1 {
vec!((self.default_fields[0], literal.phrase.clone()))
}
else {
return Err(QueryParserError::NoDefaultFieldDeclared);
} else if self.default_fields.len() == 1 {
vec![(self.default_fields[0], literal.phrase.clone())]
} else {
self.default_fields
.iter()
.map(|default_field| (*default_field, literal.phrase.clone()))
@@ -234,29 +211,61 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(box ast));
}
}
let result_ast =
if asts.len() == 0 {
panic!("not working");
}
else if asts.len() == 1 {
asts[0].clone()
}
else {
LogicalAST::Clause(asts
.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
let result_ast = if asts.len() == 0 {
panic!("not working");
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok((Occur::Should, result_ast))
}
}
}
}
}
/// Combines an outer occur modifier (`left`) with an inner one (`right`).
///
/// * `Should` on the left is neutral: the result is `right`.
/// * `Must` on the left yields `Must`, unless `right` is `MustNot`,
///   in which case the result is `MustNot`.
/// * `MustNot` on the left yields `MustNot`, unless `right` is also
///   `MustNot`, in which case the two negations cancel out into `Must`.
fn compose_occur(left: Occur, right: Occur) -> Occur {
    match (left, right) {
        (Occur::Should, inner) => inner,
        (Occur::Must, Occur::MustNot) => Occur::MustNot,
        (Occur::Must, _) => Occur::Must,
        (Occur::MustNot, Occur::MustNot) => Occur::Must,
        (Occur::MustNot, _) => Occur::MustNot,
    }
}
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
match logical_literal {
LogicalLiteral::Term(term) => box TermQuery::new(term, SegmentPostingsOption::Freq),
LogicalLiteral::Phrase(terms) => box PhraseQuery::from(terms),
}
}
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
match logical_ast {
LogicalAST::Clause(clause) => {
let occur_subqueries = clause.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
box BooleanQuery::from(occur_subqueries)
}
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
}
}
@@ -266,13 +275,15 @@ mod test {
use super::QueryParser;
use super::QueryParserError;
use super::super::logical_ast::*;
fn parse_query_to_logical_ast(query: &str, default_conjunction: bool) -> Result<LogicalAST, QueryParserError> {
fn parse_query_to_logical_ast(query: &str,
default_conjunction: bool)
-> Result<LogicalAST, QueryParserError> {
let mut schema_builder = SchemaBuilder::default();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let default_fields = vec!(title, text);
let default_fields = vec![title, text];
let mut query_parser = QueryParser::new(schema, default_fields);
if default_conjunction {
query_parser.set_conjunction_by_default();
@@ -280,7 +291,9 @@ mod test {
query_parser.parse_query_to_logical_ast(query)
}
fn test_parse_query_to_logical_ast_helper(query: &str, expected: &str, default_conjunction: bool) {
fn test_parse_query_to_logical_ast_helper(query: &str,
expected: &str,
default_conjunction: bool) {
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, expected);
@@ -288,21 +301,43 @@ mod test {
#[test]
pub fn test_parse_query_to_ast_disjunction() {
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", false);
test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", false);
test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(), QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b", "(Term([0, 97]) (Term([0, 98]) Term([1, 98])))", false);
test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", false);
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(Term([0, 97]) (Term([0, 98]) Term([1, 98])))",
false);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 97]), Term([0, 98])]\"",
false);
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", true);
test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", true);
test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(), QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b", "(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))", true);
test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", true);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))",
true);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 97]), Term([0, 98])]\"",
true);
}
}