issue/57 Switch to the new query parser.

This commit is contained in:
Paul Masurel
2016-11-16 00:53:04 +09:00
parent 0fc265eb2d
commit 831ed4caf6
11 changed files with 100 additions and 143 deletions

View File

@@ -56,9 +56,6 @@ pub struct SimpleTokenizer;
impl SimpleTokenizer {
pub fn new() -> SimpleTokenizer {
SimpleTokenizer
}
pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
TokenIter {
@@ -71,7 +68,7 @@ impl SimpleTokenizer {
#[test]
fn test_tokenizer() {
let simple_tokenizer = SimpleTokenizer::new();
let simple_tokenizer = SimpleTokenizer;
let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
assert_eq!(term_reader.next().unwrap(), "hello");
assert_eq!(term_reader.next().unwrap(), "happy");
@@ -83,7 +80,7 @@ fn test_tokenizer() {
#[test]
fn test_tokenizer_empty() {
let simple_tokenizer = SimpleTokenizer::new();
let simple_tokenizer = SimpleTokenizer;
let mut term_reader = simple_tokenizer.tokenize("");
assert_eq!(term_reader.next(), None);
}

View File

@@ -45,8 +45,8 @@ impl From<io::Error> for Error {
}
}
impl From<query::ParsingError> for Error {
fn from(parsing_error: query::ParsingError) -> Error {
impl From<query::QueryParserError> for Error {
fn from(parsing_error: query::QueryParserError) -> Error {
Error::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
}
}

View File

@@ -8,23 +8,22 @@ mod boolean_query;
mod multi_term_query;
mod phrase_query;
mod scorer;
mod query_parser;
mod occur;
mod weight;
mod occur_filter;
mod term_query;
mod query_parser2;
mod query_parser;
pub use self::occur_filter::OccurFilter;
pub use self::boolean_query::BooleanQuery;
pub use self::occur::Occur;
pub use self::query::Query;
pub use self::term_query::TermQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::multi_term_query::MultiTermQuery;
pub use self::multi_term_query::MultiTermWeight;
pub use self::scorer::Scorer;
pub use self::scorer::EmptyScorer;
pub use self::occur_filter::OccurFilter;
pub use self::occur::Occur;
pub use self::phrase_query::PhraseQuery;
pub use self::query_parser::QueryParserError;
pub use self::query_parser::QueryParser;
pub use self::query_parser::ParsingError;
pub use self::query::Query;
pub use self::scorer::EmptyScorer;
pub use self::scorer::Scorer;
pub use self::term_query::TermQuery;
pub use self::weight::Weight;

View File

@@ -1,104 +0,0 @@
use combine::*;
use combine::char::*;
use query::{Query, MultiTermQuery};
use schema::{Schema, FieldType, Term, Field};
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
use query::Occur;
/// Possible errors that may happen when parsing a query.
#[derive(Debug)]
pub enum ParsingError {
/// Error in the query syntax.
SyntaxError,
/// `FieldDoesNotExist(field_name: String)`
///
/// The query references a field that is not in the schema.
FieldDoesNotExist(String),
/// `ExpectedU32(field_name: String, field_value: String)`
///
/// The query contains a term for a `u32` field, but the value
/// is not a `u32`.
ExpectedU32(String, String),
}
/// Tantivy's query parser.
///
/// The language covered by the current parser is extremely simple.
///
/// * simple terms: e.g. `Barack Obama` is simply analyzed using
///   tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.
///   The terms are then searched within the default fields of the query parser.
///
///   e.g. If `body` and `title` are default fields, our example terms are
///   `["title:barack", "body:barack", "title:obama", "body:obama"]`.
///   By default, all tokenized and indexed fields are default fields.
///
///   Multiple terms are handled as an `OR`: any document containing at least
///   one of the terms will go through the scoring.
///
///   This behavior is slower, but is not a bad idea if the user is sorting
///   by relevance: the user typically just scans through the first few
///   documents in order of decreasing relevance and will stop when the documents
///   are not relevant anymore.
///   Making this behavior customizable is tracked in
///   [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
///
/// * negative terms: By prepending a `-` to a term, the term can be excluded
///   from the search. This is useful for disambiguating a query.
///   e.g. `apple -fruit`
///
/// * must terms: By prepending a `+` to a term, the term can be made required for the search.
///
// NOTE(review): this file imports `analyzer::SimpleTokenizer`, not a
// `StandardTokenizer` — confirm which tokenizer the text above refers to.
// NOTE(review): `QueryParser::new` takes `default_fields` explicitly, so the
// "by default, all tokenized and indexed fields are default fields" sentence
// cannot be confirmed from this file — verify against the callers.
pub struct QueryParser {
/// Schema passed to `QueryParser::new`.
schema: Schema,
/// Fields passed to `QueryParser::new`; documented there as the fields
/// searched when the query does not name a field.
default_fields: Vec<Field>,
}
impl QueryParser {
/// Creates a `QueryParser`.
///
/// * `schema` - index schema
/// * `default_fields` - fields used to search if no field is specifically defined
///   in the query.
///
/// Both arguments are stored as-is in the returned parser.
pub fn new(schema: Schema,
default_fields: Vec<Field>) -> QueryParser {
QueryParser {
schema: schema,
default_fields: default_fields,
}
}
/// Parse a query.
///
/// Note that `parse_query` is declared to return an error if the input
/// is not a valid query.
///
/// There is currently no lenient mode for the query parser
/// which makes it a bad choice for a public/broad user search engine.
///
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
///
/// # Panics
///
/// The body below is a stub: it ignores `query` and unconditionally panics
/// with the message `"a"`, so neither `Ok` nor `Err` is ever returned.
pub fn parse_query<I>(&self, query: I) -> Result<Box<Query>, ParsingError> where I: Stream<Item = char> {
// Placeholder body; no parsing is performed here.
panic!("a");
}
}
/// A literal of the query language, as paired with an `Occur` in the
/// return type of `query_language`.
// NOTE(review): the payload meanings below are inferred from the variant
// names only — confirm against the grammar that builds these values.
#[derive(Debug, Eq, PartialEq)]
pub enum Literal {
/// Presumably `(field_name, text)`: a literal that names its field explicitly.
WithField(String, String),
/// Presumably the literal text alone, with no field named in the query.
DefaultField(String),
}
/// Parser entry point declared to turn a character stream into a list of
/// `(Occur, Literal)` pairs.
///
/// # Panics
///
/// The body is a stub: it ignores `input` and unconditionally panics with
/// the message `"a"`.
pub fn query_language<I>(input: I) -> ParseResult<Vec<(Occur, Literal)>, I>
where I: Stream<Item = char>
{
// Placeholder body; no parsing is performed here.
panic!("a");
}

View File

@@ -16,7 +16,7 @@ pub enum LogicalAST{
fn occur_letter(occur: Occur) -> &'static str {
match occur {
Occur::Must => "'+",
Occur::Must => "+",
Occur::MustNot => "-",
Occur::Should => "",
}
@@ -31,10 +31,11 @@ impl fmt::Debug for LogicalAST {
}
else {
let (ref occur, ref subquery) = clause[0];
try!(write!(formatter, "{}{:?}", occur_letter(*occur), subquery));
try!(write!(formatter, "({}{:?}", occur_letter(*occur), subquery));
for &(ref occur, ref subquery) in &clause[1..] {
try!(write!(formatter, "{}{:?}", occur_letter(*occur), subquery));
try!(write!(formatter, " {}{:?}", occur_letter(*occur), subquery));
}
try!(formatter.write_str(")"));
}
Ok(())
}
@@ -58,7 +59,6 @@ impl fmt::Debug for LogicalLiteral {
write!(formatter, "{:?}", term)
},
LogicalLiteral::Phrase(ref terms) => {
// write!(formatter, "\"{}\"", literal)
write!(formatter, "\"{:?}\"", terms)
}
}

View File

@@ -0,0 +1,7 @@
mod query_parser;
mod query_grammar;
mod user_input_ast;
mod logical_ast;
pub use self::query_parser::QueryParser;
pub use self::query_parser::QueryParserError;

View File

@@ -1,7 +1,6 @@
use combine::*;
use combine::char::*;
use super::user_input_ast::*;
use schema::{Schema, Field};
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char> {
@@ -61,7 +60,6 @@ pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
#[cfg(test)]
mod test {
use combine::*;
use super::*;
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {

View File

@@ -4,11 +4,9 @@ use query::BooleanQuery;
use super::logical_ast::*;
use super::user_input_ast::*;
use super::query_grammar::parse_to_ast;
use super::boolean_operator::BooleanOperator;
use query::Occur;
use query::TermQuery;
use query::PhraseQuery;
use combine::ParseError;
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
use schema::Term;
@@ -16,7 +14,7 @@ use schema::Term;
/// Possible error that may happen when parsing a query.
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq)]
pub enum QueryParserError {
/// Error in the query syntax
SyntaxError,
@@ -66,7 +64,7 @@ pub enum QueryParserError {
pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
default_operator: BooleanOperator,
conjunction_by_default: bool,
analyzer: Box<SimpleTokenizer>,
}
@@ -80,11 +78,15 @@ impl QueryParser {
QueryParser {
schema: schema,
default_fields: default_fields,
default_operator: BooleanOperator::And,
conjunction_by_default: false,
analyzer: box SimpleTokenizer,
}
}
/// Switches the parser to conjunction-by-default mode.
///
/// Sets the `conjunction_by_default` flag to `true` (the constructor
/// initialises it to `false`). Once set, `default_occur` yields
/// `Occur::Must` instead of `Occur::Should`.
pub fn set_conjunction_by_default(&mut self) {
self.conjunction_by_default = true;
}
/// Parse a query
///
/// Note that `parse_query` returns an error if the input
@@ -96,10 +98,14 @@ impl QueryParser {
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
let (user_input_ast, remaining) = try!(parse_to_ast(query).map_err(|e| QueryParserError::SyntaxError));
let logical_ast = try!(self.compute_logical_ast(user_input_ast));
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(logical_ast))
}
/// Parses `query` into a `LogicalAST`, without converting it into a `Query`.
///
/// The query string is first run through `parse_to_ast`; the resulting
/// user-input AST is then handed to `compute_logical_ast`.
///
/// # Errors
///
/// * Any failure reported by `parse_to_ast` is mapped to
///   `QueryParserError::SyntaxError` (the underlying parse error is dropped).
/// * Errors from `compute_logical_ast` are returned unchanged.
pub fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
    // `parse_to_ast` yields a pair; only the AST is used. The second element
    // is never inspected, so it is bound to `_` rather than to an unused name.
    // NOTE(review): presumably that element is the unconsumed input — if so,
    // trailing unparsed text is silently accepted here; confirm this is intended.
    let (user_input_ast, _) = parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
    self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema.get_field(field_name)
@@ -139,12 +145,28 @@ impl QueryParser {
}
}
/// Returns the occurrence selected by the `conjunction_by_default` flag:
/// `Occur::Must` when the flag is set, `Occur::Should` otherwise.
fn default_occur(&self) -> Occur {
    match self.conjunction_by_default {
        true => Occur::Must,
        false => Occur::Should,
    }
}
pub fn compute_logical_ast_with_occur(&self, user_input_ast: UserInputAST) -> Result<(Occur, LogicalAST), QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
let default_occur = self.default_occur();
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries
.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res|
res.map(
|(occur, sub_ast)| (default_occur.compose(occur), sub_ast)
)
)
.collect());
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
@@ -230,3 +252,50 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
}
}
#[cfg(test)]
mod test {
    use schema::{SchemaBuilder, TEXT};
    use super::QueryParser;
    use super::QueryParserError;
    use super::super::logical_ast::*;

    /// Parses `query` with a parser built over a two-field text schema
    /// (`title`, `text`), both registered as default fields.
    ///
    /// When `default_conjunction` is true the parser is switched to
    /// conjunction-by-default before parsing.
    fn parse_query_to_logical_ast(query: &str, default_conjunction: bool) -> Result<LogicalAST, QueryParserError> {
        let mut builder = SchemaBuilder::default();
        let title_field = builder.add_text_field("title", TEXT);
        let text_field = builder.add_text_field("text", TEXT);
        let mut parser = QueryParser::new(builder.build(), vec![title_field, text_field]);
        if default_conjunction {
            parser.set_conjunction_by_default();
        }
        parser.parse_query_to_logical_ast(query)
    }

    /// Asserts that `query` parses successfully and that the `Debug`
    /// rendering of its logical AST equals `expected`.
    fn test_parse_query_to_logical_ast_helper(query: &str, expected: &str, default_conjunction: bool) {
        let logical_ast = parse_query_to_logical_ast(query, default_conjunction).unwrap();
        let rendered = format!("{:?}", logical_ast);
        assert_eq!(rendered, expected);
    }

    /// Expectations with the parser left in its initial (non-conjunction) mode.
    #[test]
    pub fn test_parse_query_to_ast_disjunction() {
        test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", false);
        test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", false);
        test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", false);
        // A query made of a single excluded term is rejected.
        assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(), QueryParserError::AllButQueryForbidden);
        test_parse_query_to_logical_ast_helper("title:a b", "(Term([0, 97]) (Term([0, 98]) Term([1, 98])))", false);
        test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", false);
    }

    /// Same queries after `set_conjunction_by_default` has been called.
    #[test]
    pub fn test_parse_query_to_ast_conjunction() {
        test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", true);
        test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", true);
        test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", true);
        // A query made of a single excluded term is rejected in this mode too.
        assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(), QueryParserError::AllButQueryForbidden);
        test_parse_query_to_logical_ast_helper("title:a b", "(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))", true);
        test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", true);
    }
}

View File

@@ -1,4 +0,0 @@
/// Boolean operator used as the type of the query parser's
/// `default_operator` field.
// NOTE(review): the variant meanings below are inferred from their names — confirm.
pub enum BooleanOperator {
/// Presumably: clauses are combined as a conjunction.
And,
/// Presumably: clauses are combined as a disjunction.
Or,
}

View File

@@ -1,5 +0,0 @@
mod query_parser;
mod query_grammar;
mod user_input_ast;
mod logical_ast;
mod boolean_operator;