Files
tantivy/src/query/query_parser/query_parser.rs
Paul Masurel 69e11d3779 issue/57 Cleaning.
Closes #57
Closes #56
Closes #23
2016-11-17 23:18:24 +09:00

344 lines
14 KiB
Rust

use schema::{Schema, Field};
use query::Query;
use query::BooleanQuery;
use super::logical_ast::*;
use super::user_input_ast::*;
use super::query_grammar::parse_to_ast;
use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
use schema::Term;
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq)]
pub enum QueryParserError {
/// Error in the query syntax
SyntaxError,
/// `FieldDoesNotExist(field_name: String)`
/// The query references a field that is not in the schema
FieldDoesNotExist(String),
/// `ExpectedU32(field_name: String, field_value: String)`
/// The query contains a term for a `u32`-field, but the value
/// is not a u32.
ExpectedU32(String, String),
/// It is forbidden queries that are only "excluding". (e.g. -title:pop)
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbbidden.
NoDefaultFieldDeclared,
}
/// Tantivy's Query parser
///
/// The language covered by the current parser is extremely simple.
///
/// * simple terms: "e.g.: `Barack Obama` are simply analyzed using
/// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.
/// The terms are then searched within the default terms of the query parser.
///
/// e.g. If `body` and `title` are default fields, our example terms are
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
/// By default, all tokenized and indexed fields are default fields.
///
/// Multiple terms are handled as an `OR` : any document containing at least
/// one of the term will go through the scoring.
///
/// This behavior is slower, but is not a bad idea if the user is sorting
/// by relevance : The user typically just scans through the first few
/// documents in order of decreasing relevance and will stop when the documents
/// are not relevant anymore.
/// Making it possible to make this behavior customizable is tracked in
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
///
/// * negative terms: By prepending a term by a `-`, a term can be excluded
/// from the search. This is useful for disambiguating a query.
/// e.g. `apple -fruit`
///
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
///
pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
conjunction_by_default: bool,
analyzer: Box<SimpleTokenizer>,
}
impl QueryParser {
/// Creates a `QueryParser`
/// * schema - index Schema
/// * default_fields - fields used to search if no field is specifically defined
/// in the query.
pub fn new(schema: Schema, default_fields: Vec<Field>) -> QueryParser {
QueryParser {
schema: schema,
default_fields: default_fields,
conjunction_by_default: false,
analyzer: box SimpleTokenizer,
}
}
/// Set the default way to compose queries to a conjunction.
///
/// By default a ,
pub fn set_conjunction_by_default(&mut self) {
self.conjunction_by_default = true;
}
/// Parse a query
///
/// Note that `parse_query` returns an error if the input
/// is not a valid query.
///
/// There is currently no lenient mode for the query parser
/// which makes it a bad choice for a public/broad user search engine.
///
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(logical_ast))
}
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) =
parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema
.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
}
fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
}
Ok(ast)
}
fn compute_logical_ast_for_leaf(&self,
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
let mut token_iter = self.analyzer.tokenize(phrase);
let mut tokens: Vec<Term> = Vec::new();
loop {
if let Some(token) = token_iter.next() {
let text = token.to_string();
// TODO Handle u32
let term = Term::from_field_text(field, &text);
tokens.push(term);
} else {
break;
}
}
if tokens.is_empty() {
Ok(None)
} else if tokens.len() == 1 {
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
} else {
Ok(Some(LogicalLiteral::Phrase(tokens)))
}
}
fn default_occur(&self) -> Occur {
if self.conjunction_by_default {
Occur::Must
} else {
Occur::Should
}
}
fn compute_logical_ast_with_occur(&self,
user_input_ast: UserInputAST)
-> Result<(Occur, LogicalAST), QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
let default_occur = self.default_occur();
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res| {
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
})
.collect());
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Not(subquery) => {
let (occur, logical_sub_queries) =
try!(self.compute_logical_ast_with_occur(*subquery));
Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
}
UserInputAST::Must(subquery) => {
let (occur, logical_sub_queries) =
try!(self.compute_logical_ast_with_occur(*subquery));
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
}
UserInputAST::Leaf(literal) => {
let term_phrases: Vec<(Field, String)> = match literal.field_name {
Some(ref field_name) => {
let field = try!(self.resolve_field_name(&field_name));
vec![(field, literal.phrase.clone())]
}
None => {
if self.default_fields.len() == 0 {
return Err(QueryParserError::NoDefaultFieldDeclared);
} else if self.default_fields.len() == 1 {
vec![(self.default_fields[0], literal.phrase.clone())]
} else {
self.default_fields
.iter()
.map(|default_field| (*default_field, literal.phrase.clone()))
.collect()
}
}
};
let mut asts: Vec<LogicalAST> = Vec::new();
for (field, phrase) in term_phrases {
if let Some(ast) = try!(self.compute_logical_ast_for_leaf(field, &phrase)) {
asts.push(LogicalAST::Leaf(box ast));
}
}
let result_ast = if asts.len() == 0 {
panic!("not working");
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok((Occur::Should, result_ast))
}
}
}
}
/// Compose two occur values.
fn compose_occur(left: Occur, right: Occur) -> Occur {
match left {
Occur::Should => right,
Occur::Must => {
if right == Occur::MustNot {
Occur::MustNot
} else {
Occur::Must
}
}
Occur::MustNot => {
if right == Occur::MustNot {
Occur::Must
} else {
Occur::MustNot
}
}
}
}
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
match logical_literal {
LogicalLiteral::Term(term) => box TermQuery::new(term, SegmentPostingsOption::Freq),
LogicalLiteral::Phrase(terms) => box PhraseQuery::from(terms),
}
}
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
match logical_ast {
LogicalAST::Clause(clause) => {
let occur_subqueries = clause.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
box BooleanQuery::from(occur_subqueries)
}
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
}
}
#[cfg(test)]
mod test {
use schema::{SchemaBuilder, TEXT};
use super::QueryParser;
use super::QueryParserError;
use super::super::logical_ast::*;
fn parse_query_to_logical_ast(query: &str,
default_conjunction: bool)
-> Result<LogicalAST, QueryParserError> {
let mut schema_builder = SchemaBuilder::default();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let default_fields = vec![title, text];
let mut query_parser = QueryParser::new(schema, default_fields);
if default_conjunction {
query_parser.set_conjunction_by_default();
}
query_parser.parse_query_to_logical_ast(query)
}
fn test_parse_query_to_logical_ast_helper(query: &str,
expected: &str,
default_conjunction: bool) {
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, expected);
}
#[test]
pub fn test_parse_query_to_ast_disjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(Term([0, 97]) (Term([0, 98]) Term([1, 98])))",
false);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 97]), Term([0, 98])]\"",
false);
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", true);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))",
true);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 97]), Term([0, 98])]\"",
true);
}
}