Issue/378 (#392)

* Added failing unit test

* Closes #378: handle queries that end up empty after going through the analyzer.

* Fixed stop word example
Author: Paul Masurel
Date: 2018-09-06 10:11:54 +09:00
Committed by: GitHub
Parent commit: cec9956a01
Commit: 98c7fbdc6f
2 changed files with 89 additions and 59 deletions
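For context: a query whose terms are all removed during analysis (punctuation only, or nothing but stop words) previously was not handled gracefully by the query parser; after this change it parses into an EmptyQuery. Below is a minimal sketch of the scenario from outside the crate, mirroring the new test in this diff. The `tantivy::` import paths, the `main` wrapper, and the field names are assumptions for illustration, not part of the commit.

use tantivy::query::QueryParser;
use tantivy::schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions, TEXT};
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager};

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    // Default search field using the standard tokenizer.
    let text = schema_builder.add_text_field("text", TEXT);
    // Field analyzed with a stop-word filter that removes "the".
    let stop_word_indexing = TextFieldIndexing::default()
        .set_tokenizer("en_with_stop_words")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let stop_word_options = TextOptions::default().set_indexing_options(stop_word_indexing);
    schema_builder.add_text_field("with_stop_words", stop_word_options);
    let schema = schema_builder.build();

    let tokenizer_manager = TokenizerManager::default();
    tokenizer_manager.register(
        "en_with_stop_words",
        SimpleTokenizer
            .filter(LowerCaser)
            .filter(StopWordFilter::remove(vec!["the".to_string()])),
    );

    let query_parser = QueryParser::new(schema, vec![text], tokenizer_manager);
    // Both queries analyze to zero tokens; with this fix they parse into an
    // EmptyQuery instead of failing.
    assert!(query_parser.parse_query(" !, ").is_ok());
    assert!(query_parser.parse_query("with_stop_words:the").is_ok());
}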


@@ -20,6 +20,7 @@ use std::str::FromStr;
use tokenizer::TokenizerManager;
use combine::Parser;
use query::EmptyQuery;
use query::query_parser::logical_ast::LogicalAST;
/// Possible error that may happen when parsing a query.
@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
}
}
/// Recursively remove empty clauses from the AST
///
/// Returns `None` iff the `logical_ast` ended up being empty.
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
match logical_ast {
LogicalAST::Clause(children) => {
let trimmed_children = children.into_iter()
.flat_map(|(occur, child)|
trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
.collect::<Vec<_>>();
if trimmed_children.is_empty() {
None
} else {
Some(LogicalAST::Clause(trimmed_children))
}
},
_ => Some(logical_ast),
}
}
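// Illustration (hypothetical, not part of this commit): `trim_ast` collapses a clause
// whose children are all empty clauses into `None`, which `convert_to_query` below
// maps to an `EmptyQuery`. `use super::*` is assumed to bring this module's private
// items (`trim_ast`, `LogicalAST`, `Occur`) into scope, as in the existing test module.
#[cfg(test)]
mod trim_ast_illustration {
    use super::*;

    #[test]
    fn clause_of_empty_clauses_trims_to_none() {
        let nested_empty =
            LogicalAST::Clause(vec![(Occur::Should, LogicalAST::Clause(Vec::new()))]);
        assert!(trim_ast(nested_empty).is_none());
    }
}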
/// Tantivy's Query parser
///
/// The language covered by the current parser is extremely simple.
@@ -369,14 +391,15 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(Box::new(ast)));
}
}
let result_ast = if asts.is_empty() {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
let result_ast: LogicalAST =
if asts.len() == 1 {
asts.into_iter().next().unwrap()
} else {
LogicalAST::Clause(
asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok(result_ast)
}
UserInputLeaf::All => {
@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
}
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
match logical_ast {
LogicalAST::Clause(clause) => {
if clause.is_empty() {
Box::new(EmptyQuery)
} else {
let occur_subqueries = clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
Box::new(BooleanQuery::from(occur_subqueries))
}
}
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
match trim_ast(logical_ast) {
Some(LogicalAST::Clause(trimmed_clause)) => {
let occur_subqueries = trimmed_clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
Box::new(BooleanQuery::from(occur_subqueries))
},
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
None => Box::new(EmptyQuery)
}
}
@@ -454,12 +475,17 @@ mod test {
use schema::Field;
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
use tokenizer::SimpleTokenizer;
use tokenizer::TokenizerManager;
use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
use Index;
fn make_query_parser() -> QueryParser {
let mut schema_builder = SchemaBuilder::default();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("en_with_stop_words")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
schema_builder.add_i64_field("signed", INT_INDEXED);
@@ -468,9 +494,14 @@ mod test {
schema_builder.add_text_field("notindexed_u64", STORED);
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
schema_builder.add_text_field("with_stop_words", text_options);
let schema = schema_builder.build();
let default_fields = vec![title, text];
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()]))
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}
@@ -739,6 +770,13 @@ mod test {
);
}
#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query(" !, ").is_ok());
assert!(query_parser.parse_query("with_stop_words:the").is_ok());
}
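// Hypothetical follow-on check (not part of this commit): trimming removes only the
// empty branches, so a query mixing a stop word with a regular term still parses.
#[test]
pub fn test_query_parser_stop_word_mixed_with_term() {
    let query_parser = make_query_parser();
    // "with_stop_words:the" analyzes to nothing and is trimmed away,
    // while "title:hello" remains as an ordinary term clause.
    assert!(query_parser
        .parse_query("with_stop_words:the title:hello")
        .is_ok());
}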
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(