mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
issue/57 Switch to the new query parser.
This commit is contained in:
@@ -56,9 +56,6 @@ pub struct SimpleTokenizer;
|
||||
|
||||
|
||||
impl SimpleTokenizer {
|
||||
pub fn new() -> SimpleTokenizer {
|
||||
SimpleTokenizer
|
||||
}
|
||||
|
||||
pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
|
||||
TokenIter {
|
||||
@@ -71,7 +68,7 @@ impl SimpleTokenizer {
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer() {
|
||||
let simple_tokenizer = SimpleTokenizer::new();
|
||||
let simple_tokenizer = SimpleTokenizer;
|
||||
let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
|
||||
assert_eq!(term_reader.next().unwrap(), "hello");
|
||||
assert_eq!(term_reader.next().unwrap(), "happy");
|
||||
@@ -83,7 +80,7 @@ fn test_tokenizer() {
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let simple_tokenizer = SimpleTokenizer::new();
|
||||
let simple_tokenizer = SimpleTokenizer;
|
||||
let mut term_reader = simple_tokenizer.tokenize("");
|
||||
assert_eq!(term_reader.next(), None);
|
||||
}
|
||||
|
||||
@@ -45,8 +45,8 @@ impl From<io::Error> for Error {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<query::ParsingError> for Error {
|
||||
fn from(parsing_error: query::ParsingError) -> Error {
|
||||
impl From<query::QueryParserError> for Error {
|
||||
fn from(parsing_error: query::QueryParserError) -> Error {
|
||||
Error::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,23 +8,22 @@ mod boolean_query;
|
||||
mod multi_term_query;
|
||||
mod phrase_query;
|
||||
mod scorer;
|
||||
mod query_parser;
|
||||
mod occur;
|
||||
mod weight;
|
||||
mod occur_filter;
|
||||
mod term_query;
|
||||
mod query_parser2;
|
||||
mod query_parser;
|
||||
|
||||
pub use self::occur_filter::OccurFilter;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::occur::Occur;
|
||||
pub use self::query::Query;
|
||||
pub use self::term_query::TermQuery;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::multi_term_query::MultiTermQuery;
|
||||
pub use self::multi_term_query::MultiTermWeight;
|
||||
pub use self::scorer::Scorer;
|
||||
pub use self::scorer::EmptyScorer;
|
||||
pub use self::occur_filter::OccurFilter;
|
||||
pub use self::occur::Occur;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query_parser::QueryParserError;
|
||||
pub use self::query_parser::QueryParser;
|
||||
pub use self::query_parser::ParsingError;
|
||||
pub use self::query::Query;
|
||||
pub use self::scorer::EmptyScorer;
|
||||
pub use self::scorer::Scorer;
|
||||
pub use self::term_query::TermQuery;
|
||||
pub use self::weight::Weight;
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
use combine::*;
|
||||
use combine::char::*;
|
||||
use query::{Query, MultiTermQuery};
|
||||
use schema::{Schema, FieldType, Term, Field};
|
||||
use analyzer::SimpleTokenizer;
|
||||
use analyzer::StreamingIterator;
|
||||
use query::Occur;
|
||||
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug)]
|
||||
pub enum ParsingError {
|
||||
/// Error in the query syntax
|
||||
SyntaxError,
|
||||
/// `FieldDoesNotExist(field_name: String)`
|
||||
/// The query references a field that is not in the schema
|
||||
FieldDoesNotExist(String),
|
||||
/// `ExpectedU32(field_name: String, field_value: String)`
|
||||
/// The query contains a term for a `u32`-field, but the value
|
||||
/// is not a u32.
|
||||
ExpectedU32(String, String),
|
||||
}
|
||||
|
||||
/// Tantivy's Query parser
|
||||
///
|
||||
/// The language covered by the current parser is extremely simple.
|
||||
///
|
||||
/// * simple terms: e.g. `Barack Obama` is simply analyzed using
|
||||
/// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.
|
||||
/// The terms are then searched within the default terms of the query parser.
|
||||
///
|
||||
/// e.g. If `body` and `title` are default fields, our example terms are
|
||||
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
|
||||
/// By default, all tokenized and indexed fields are default fields.
|
||||
///
|
||||
/// Multiple terms are handled as an `OR` : any document containing at least
|
||||
/// one of the terms will go through the scoring.
|
||||
///
|
||||
/// This behavior is slower, but is not a bad idea if the user is sorting
|
||||
/// by relevance : The user typically just scans through the first few
|
||||
/// documents in order of decreasing relevance and will stop when the documents
|
||||
/// are not relevant anymore.
|
||||
/// Making it possible to make this behavior customizable is tracked in
|
||||
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
|
||||
///
|
||||
/// * negative terms: By prefixing a term with a `-`, the term can be excluded
|
||||
/// from the search. This is useful for disambiguating a query.
|
||||
/// e.g. `apple -fruit`
|
||||
///
|
||||
/// * must terms: By prefixing a term with a `+`, the term can be made required for the search.
|
||||
///
|
||||
pub struct QueryParser {
|
||||
schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl QueryParser {
|
||||
/// Creates a `QueryParser`
|
||||
/// * schema - index Schema
|
||||
/// * default_fields - fields used to search if no field is specifically defined
|
||||
/// in the query.
|
||||
pub fn new(schema: Schema,
|
||||
default_fields: Vec<Field>) -> QueryParser {
|
||||
QueryParser {
|
||||
schema: schema,
|
||||
default_fields: default_fields,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Parse a query
|
||||
///
|
||||
/// Note that `parse_query` returns an error if the input
|
||||
/// is not a valid query.
|
||||
///
|
||||
/// There is currently no lenient mode for the query parser
|
||||
/// which makes it a bad choice for a public/broad user search engine.
|
||||
///
|
||||
/// Implementing a lenient mode for this query parser is tracked
|
||||
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
|
||||
pub fn parse_query<I>(&self, query: I) -> Result<Box<Query>, ParsingError> where I: Stream<Item = char> {
|
||||
panic!("a");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum Literal {
|
||||
WithField(String, String),
|
||||
DefaultField(String),
|
||||
}
|
||||
|
||||
|
||||
pub fn query_language<I>(input: I) -> ParseResult<Vec<(Occur, Literal)>, I>
|
||||
where I: Stream<Item = char>
|
||||
{
|
||||
panic!("a");
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ pub enum LogicalAST{
|
||||
|
||||
fn occur_letter(occur: Occur) -> &'static str {
|
||||
match occur {
|
||||
Occur::Must => "'+",
|
||||
Occur::Must => "+",
|
||||
Occur::MustNot => "-",
|
||||
Occur::Should => "",
|
||||
}
|
||||
@@ -31,10 +31,11 @@ impl fmt::Debug for LogicalAST {
|
||||
}
|
||||
else {
|
||||
let (ref occur, ref subquery) = clause[0];
|
||||
try!(write!(formatter, "{}{:?}", occur_letter(*occur), subquery));
|
||||
try!(write!(formatter, "({}{:?}", occur_letter(*occur), subquery));
|
||||
for &(ref occur, ref subquery) in &clause[1..] {
|
||||
try!(write!(formatter, "{}{:?}", occur_letter(*occur), subquery));
|
||||
try!(write!(formatter, " {}{:?}", occur_letter(*occur), subquery));
|
||||
}
|
||||
try!(formatter.write_str(")"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -58,7 +59,6 @@ impl fmt::Debug for LogicalLiteral {
|
||||
write!(formatter, "{:?}", term)
|
||||
},
|
||||
LogicalLiteral::Phrase(ref terms) => {
|
||||
// write!(formatter, "\"{}\"", literal)
|
||||
write!(formatter, "\"{:?}\"", terms)
|
||||
}
|
||||
}
|
||||
7
src/query/query_parser/mod.rs
Normal file
7
src/query/query_parser/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
mod query_parser;
|
||||
mod query_grammar;
|
||||
mod user_input_ast;
|
||||
mod logical_ast;
|
||||
|
||||
pub use self::query_parser::QueryParser;
|
||||
pub use self::query_parser::QueryParserError;
|
||||
@@ -1,7 +1,6 @@
|
||||
use combine::*;
|
||||
use combine::char::*;
|
||||
use super::user_input_ast::*;
|
||||
use schema::{Schema, Field};
|
||||
|
||||
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char> {
|
||||
@@ -61,7 +60,6 @@ pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use combine::*;
|
||||
use super::*;
|
||||
|
||||
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
|
||||
@@ -4,11 +4,9 @@ use query::BooleanQuery;
|
||||
use super::logical_ast::*;
|
||||
use super::user_input_ast::*;
|
||||
use super::query_grammar::parse_to_ast;
|
||||
use super::boolean_operator::BooleanOperator;
|
||||
use query::Occur;
|
||||
use query::TermQuery;
|
||||
use query::PhraseQuery;
|
||||
use combine::ParseError;
|
||||
use analyzer::SimpleTokenizer;
|
||||
use analyzer::StreamingIterator;
|
||||
use schema::Term;
|
||||
@@ -16,7 +14,7 @@ use schema::Term;
|
||||
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum QueryParserError {
|
||||
/// Error in the query syntax
|
||||
SyntaxError,
|
||||
@@ -66,7 +64,7 @@ pub enum QueryParserError {
|
||||
pub struct QueryParser {
|
||||
schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
default_operator: BooleanOperator,
|
||||
conjunction_by_default: bool,
|
||||
analyzer: Box<SimpleTokenizer>,
|
||||
}
|
||||
|
||||
@@ -80,11 +78,15 @@ impl QueryParser {
|
||||
QueryParser {
|
||||
schema: schema,
|
||||
default_fields: default_fields,
|
||||
default_operator: BooleanOperator::And,
|
||||
conjunction_by_default: false,
|
||||
analyzer: box SimpleTokenizer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Turns on conjunction-by-default by setting the parser's
/// `conjunction_by_default` flag to `true`.
///
/// The flag is read by `default_occur`, which yields `Occur::Must` when it
/// is set and `Occur::Should` otherwise. This method only ever sets the
/// flag; it offers no way to clear it again.
pub fn set_conjunction_by_default(&mut self) {
|
||||
self.conjunction_by_default = true;
|
||||
}
|
||||
|
||||
/// Parse a query
|
||||
///
|
||||
/// Note that `parse_query` returns an error if the input
|
||||
@@ -96,10 +98,14 @@ impl QueryParser {
|
||||
/// Implementing a lenient mode for this query parser is tracked
|
||||
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
|
||||
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
|
||||
let (user_input_ast, remaining) = try!(parse_to_ast(query).map_err(|e| QueryParserError::SyntaxError));
|
||||
let logical_ast = try!(self.compute_logical_ast(user_input_ast));
|
||||
let logical_ast = self.parse_query_to_logical_ast(query)?;
|
||||
Ok(convert_to_query(logical_ast))
|
||||
}
|
||||
|
||||
/// Parses `query` and returns the resulting `LogicalAST`.
///
/// The string is first handed to `parse_to_ast`; any error from that step
/// is replaced by `QueryParserError::SyntaxError` (the original error value
/// is discarded). The user-input AST is then passed to
/// `compute_logical_ast`, whose result is returned as-is.
pub fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
||||
// NOTE(review): `remaining` is bound but never used in this function.
// Presumably it is the input left unconsumed by `parse_to_ast` — confirm
// that trailing input is meant to be ignored rather than rejected.
let (user_input_ast, remaining) = parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
|
||||
self.schema.get_field(field_name)
|
||||
@@ -139,12 +145,28 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `Occur` implied by the parser's default mode:
/// `Occur::Must` when `conjunction_by_default` is set, `Occur::Should`
/// otherwise.
fn default_occur(&self) -> Occur {
|
||||
if self.conjunction_by_default {
|
||||
Occur::Must
|
||||
}
|
||||
else {
|
||||
Occur::Should
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn compute_logical_ast_with_occur(&self, user_input_ast: UserInputAST) -> Result<(Occur, LogicalAST), QueryParserError> {
|
||||
match user_input_ast {
|
||||
UserInputAST::Clause(sub_queries) => {
|
||||
let default_occur = self.default_occur();
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries
|
||||
.into_iter()
|
||||
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
||||
.map(|res|
|
||||
res.map(
|
||||
|(occur, sub_ast)| (default_occur.compose(occur), sub_ast)
|
||||
)
|
||||
)
|
||||
.collect());
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
}
|
||||
@@ -230,3 +252,50 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
use super::QueryParser;
|
||||
use super::QueryParserError;
|
||||
use super::super::logical_ast::*;
|
||||
|
||||
/// Test helper: builds a `QueryParser` over a schema with two `TEXT`
/// fields, `title` and `text` (both registered as default fields), and
/// parses `query` into a `LogicalAST`.
///
/// When `default_conjunction` is `true`, `set_conjunction_by_default` is
/// called on the parser before parsing. The parser's result is returned
/// unchanged, so callers can inspect either the AST or the error.
fn parse_query_to_logical_ast(query: &str, default_conjunction: bool) -> Result<LogicalAST, QueryParserError> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec!(title, text);
|
||||
let mut query_parser = QueryParser::new(schema, default_fields);
|
||||
if default_conjunction {
|
||||
query_parser.set_conjunction_by_default();
|
||||
}
|
||||
query_parser.parse_query_to_logical_ast(query)
|
||||
}
|
||||
|
||||
/// Test helper: parses `query` with `parse_query_to_logical_ast` and asserts
/// that the `{:?}` rendering of the resulting AST equals `expected`.
///
/// Panics (via `unwrap`) if parsing returns an error, so it is only
/// suitable for queries that are expected to parse successfully.
fn test_parse_query_to_logical_ast_helper(query: &str, expected: &str, default_conjunction: bool) {
|
||||
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
|
||||
let query_str = format!("{:?}", query);
|
||||
assert_eq!(query_str, expected);
|
||||
}
|
||||
|
||||
/// Checks the debug rendering of logical ASTs when conjunction-by-default
/// is off (every helper call passes `false`).
#[test]
|
||||
pub fn test_parse_query_to_ast_disjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", false);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", false);
|
||||
// The field-less word `titi` is expected to render as two `Term`s whose
// first byte is 0 and 1 — presumably one per default field (`title`,
// `text`); confirm against the `Term` encoding.
test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", false);
|
||||
// A query made of a single excluded term must be rejected.
assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(), QueryParserError::AllButQueryForbidden);
|
||||
// With the flag off, the two sub-clauses carry no occur prefix.
test_parse_query_to_logical_ast_helper("title:a b", "(Term([0, 97]) (Term([0, 98]) Term([1, 98])))", false);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", false);
|
||||
}
|
||||
|
||||
/// Checks the debug rendering of logical ASTs when conjunction-by-default
/// is on (every helper call passes `true`).
#[test]
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 116, 111, 116, 111])", true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto -titi", "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))", true);
|
||||
// A query made of a single excluded term must be rejected in this mode too.
assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(), QueryParserError::AllButQueryForbidden);
|
||||
// With the flag on, each unmarked sub-clause is expected to render with a
// `+` prefix, unlike the same query in the disjunction test.
test_parse_query_to_logical_ast_helper("title:a b", "(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))", true);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"", true);
|
||||
}
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
pub enum BooleanOperator {
|
||||
And,
|
||||
Or,
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
mod query_parser;
|
||||
mod query_grammar;
|
||||
mod user_input_ast;
|
||||
mod logical_ast;
|
||||
mod boolean_operator;
|
||||
Reference in New Issue
Block a user