mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 10:02:55 +00:00
344 lines
14 KiB
Rust
344 lines
14 KiB
Rust
use schema::{Schema, Field};
|
|
use query::Query;
|
|
use query::BooleanQuery;
|
|
use super::logical_ast::*;
|
|
use super::user_input_ast::*;
|
|
use super::query_grammar::parse_to_ast;
|
|
use query::Occur;
|
|
use query::TermQuery;
|
|
use postings::SegmentPostingsOption;
|
|
use query::PhraseQuery;
|
|
use analyzer::SimpleTokenizer;
|
|
use analyzer::StreamingIterator;
|
|
use schema::Term;
|
|
|
|
|
|
|
|
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq)]
pub enum QueryParserError {
    /// Error in the query syntax.
    SyntaxError,
    /// `FieldDoesNotExist(field_name: String)`
    ///
    /// The query references a field that is not in the schema.
    FieldDoesNotExist(String),
    /// `ExpectedU32(field_name: String, field_value: String)`
    ///
    /// The query contains a term for a `u32`-field, but the value
    /// is not a u32.
    ExpectedU32(String, String),
    /// Queries that are only "excluding" (e.g. `-title:pop`) are forbidden.
    AllButQueryForbidden,
    /// If no default field is declared, running a query without any
    /// field specified is forbidden.
    NoDefaultFieldDeclared,
}
|
|
|
|
|
|
|
|
/// Tantivy's Query parser
|
|
///
|
|
/// The language covered by the current parser is extremely simple.
|
|
///
|
|
/// * simple terms: "e.g.: `Barack Obama` are simply analyzed using
|
|
/// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.
|
|
/// The terms are then searched within the default terms of the query parser.
|
|
///
|
|
/// e.g. If `body` and `title` are default fields, our example terms are
|
|
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
|
|
/// By default, all tokenized and indexed fields are default fields.
|
|
///
|
|
/// Multiple terms are handled as an `OR` : any document containing at least
|
|
/// one of the term will go through the scoring.
|
|
///
|
|
/// This behavior is slower, but is not a bad idea if the user is sorting
|
|
/// by relevance : The user typically just scans through the first few
|
|
/// documents in order of decreasing relevance and will stop when the documents
|
|
/// are not relevant anymore.
|
|
/// Making it possible to make this behavior customizable is tracked in
|
|
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
|
|
///
|
|
/// * negative terms: By prepending a term by a `-`, a term can be excluded
|
|
/// from the search. This is useful for disambiguating a query.
|
|
/// e.g. `apple -fruit`
|
|
///
|
|
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
|
|
///
|
|
pub struct QueryParser {
|
|
schema: Schema,
|
|
default_fields: Vec<Field>,
|
|
conjunction_by_default: bool,
|
|
analyzer: Box<SimpleTokenizer>,
|
|
}
|
|
|
|
impl QueryParser {
|
|
/// Creates a `QueryParser`
|
|
/// * schema - index Schema
|
|
/// * default_fields - fields used to search if no field is specifically defined
|
|
/// in the query.
|
|
pub fn new(schema: Schema, default_fields: Vec<Field>) -> QueryParser {
|
|
QueryParser {
|
|
schema: schema,
|
|
default_fields: default_fields,
|
|
conjunction_by_default: false,
|
|
analyzer: box SimpleTokenizer,
|
|
}
|
|
}
|
|
|
|
/// Set the default way to compose queries to a conjunction.
|
|
///
|
|
/// By default a ,
|
|
pub fn set_conjunction_by_default(&mut self) {
|
|
self.conjunction_by_default = true;
|
|
}
|
|
|
|
/// Parse a query
|
|
///
|
|
/// Note that `parse_query` returns an error if the input
|
|
/// is not a valid query.
|
|
///
|
|
/// There is currently no lenient mode for the query parser
|
|
/// which makes it a bad choice for a public/broad user search engine.
|
|
///
|
|
/// Implementing a lenient mode for this query parser is tracked
|
|
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
|
|
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
|
|
let logical_ast = self.parse_query_to_logical_ast(query)?;
|
|
Ok(convert_to_query(logical_ast))
|
|
}
|
|
|
|
/// Parse the user query into an AST.
|
|
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
|
let (user_input_ast, _remaining) =
|
|
parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
|
|
self.compute_logical_ast(user_input_ast)
|
|
}
|
|
|
|
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
|
|
self.schema
|
|
.get_field(field_name)
|
|
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
|
|
}
|
|
|
|
fn compute_logical_ast(&self,
|
|
user_input_ast: UserInputAST)
|
|
-> Result<LogicalAST, QueryParserError> {
|
|
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
|
|
if occur == Occur::MustNot {
|
|
return Err(QueryParserError::AllButQueryForbidden);
|
|
}
|
|
Ok(ast)
|
|
}
|
|
|
|
fn compute_logical_ast_for_leaf(&self,
|
|
field: Field,
|
|
phrase: &str)
|
|
-> Result<Option<LogicalLiteral>, QueryParserError> {
|
|
let mut token_iter = self.analyzer.tokenize(phrase);
|
|
let mut tokens: Vec<Term> = Vec::new();
|
|
loop {
|
|
if let Some(token) = token_iter.next() {
|
|
let text = token.to_string();
|
|
// TODO Handle u32
|
|
let term = Term::from_field_text(field, &text);
|
|
tokens.push(term);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if tokens.is_empty() {
|
|
Ok(None)
|
|
} else if tokens.len() == 1 {
|
|
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
|
|
} else {
|
|
Ok(Some(LogicalLiteral::Phrase(tokens)))
|
|
}
|
|
}
|
|
|
|
fn default_occur(&self) -> Occur {
|
|
if self.conjunction_by_default {
|
|
Occur::Must
|
|
} else {
|
|
Occur::Should
|
|
}
|
|
}
|
|
|
|
fn compute_logical_ast_with_occur(&self,
|
|
user_input_ast: UserInputAST)
|
|
-> Result<(Occur, LogicalAST), QueryParserError> {
|
|
match user_input_ast {
|
|
UserInputAST::Clause(sub_queries) => {
|
|
let default_occur = self.default_occur();
|
|
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
|
|
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
|
.map(|res| {
|
|
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
|
|
})
|
|
.collect());
|
|
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
|
}
|
|
UserInputAST::Not(subquery) => {
|
|
let (occur, logical_sub_queries) =
|
|
try!(self.compute_logical_ast_with_occur(*subquery));
|
|
Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
|
|
}
|
|
UserInputAST::Must(subquery) => {
|
|
let (occur, logical_sub_queries) =
|
|
try!(self.compute_logical_ast_with_occur(*subquery));
|
|
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
|
|
}
|
|
UserInputAST::Leaf(literal) => {
|
|
let term_phrases: Vec<(Field, String)> = match literal.field_name {
|
|
Some(ref field_name) => {
|
|
let field = try!(self.resolve_field_name(&field_name));
|
|
vec![(field, literal.phrase.clone())]
|
|
}
|
|
None => {
|
|
if self.default_fields.len() == 0 {
|
|
return Err(QueryParserError::NoDefaultFieldDeclared);
|
|
} else if self.default_fields.len() == 1 {
|
|
vec![(self.default_fields[0], literal.phrase.clone())]
|
|
} else {
|
|
self.default_fields
|
|
.iter()
|
|
.map(|default_field| (*default_field, literal.phrase.clone()))
|
|
.collect()
|
|
}
|
|
}
|
|
};
|
|
let mut asts: Vec<LogicalAST> = Vec::new();
|
|
for (field, phrase) in term_phrases {
|
|
if let Some(ast) = try!(self.compute_logical_ast_for_leaf(field, &phrase)) {
|
|
asts.push(LogicalAST::Leaf(box ast));
|
|
}
|
|
}
|
|
let result_ast = if asts.len() == 0 {
|
|
panic!("not working");
|
|
} else if asts.len() == 1 {
|
|
asts[0].clone()
|
|
} else {
|
|
LogicalAST::Clause(asts.into_iter()
|
|
.map(|ast| (Occur::Should, ast))
|
|
.collect())
|
|
};
|
|
Ok((Occur::Should, result_ast))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/// Compose two occur values.
|
|
fn compose_occur(left: Occur, right: Occur) -> Occur {
|
|
match left {
|
|
Occur::Should => right,
|
|
Occur::Must => {
|
|
if right == Occur::MustNot {
|
|
Occur::MustNot
|
|
} else {
|
|
Occur::Must
|
|
}
|
|
}
|
|
Occur::MustNot => {
|
|
if right == Occur::MustNot {
|
|
Occur::Must
|
|
} else {
|
|
Occur::MustNot
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
|
|
match logical_literal {
|
|
LogicalLiteral::Term(term) => box TermQuery::new(term, SegmentPostingsOption::Freq),
|
|
LogicalLiteral::Phrase(terms) => box PhraseQuery::from(terms),
|
|
}
|
|
}
|
|
|
|
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
|
match logical_ast {
|
|
LogicalAST::Clause(clause) => {
|
|
let occur_subqueries = clause.into_iter()
|
|
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
|
.collect::<Vec<_>>();
|
|
box BooleanQuery::from(occur_subqueries)
|
|
}
|
|
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
|
|
}
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod test {
    use schema::{SchemaBuilder, TEXT};
    use super::QueryParser;
    use super::QueryParserError;
    use super::super::logical_ast::*;

    /// Parses `query` with a parser built over a schema holding the text
    /// fields `title` and `text`, both registered as default fields.
    /// `default_conjunction` switches the parser to conjunction mode.
    fn parse_query_to_logical_ast(query: &str,
                                  default_conjunction: bool)
                                  -> Result<LogicalAST, QueryParserError> {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", TEXT);
        let text = schema_builder.add_text_field("text", TEXT);
        let mut query_parser = QueryParser::new(schema_builder.build(), vec![title, text]);
        if default_conjunction {
            query_parser.set_conjunction_by_default();
        }
        query_parser.parse_query_to_logical_ast(query)
    }

    /// Asserts that the `Debug` rendering of the logical AST parsed from
    /// `query` equals `expected`.
    fn test_parse_query_to_logical_ast_helper(query: &str,
                                              expected: &str,
                                              default_conjunction: bool) {
        let logical_ast = parse_query_to_logical_ast(query, default_conjunction).unwrap();
        assert_eq!(format!("{:?}", logical_ast), expected);
    }

    #[test]
    pub fn test_parse_query_to_ast_disjunction() {
        let check = |query: &str, expected: &str| {
            test_parse_query_to_logical_ast_helper(query, expected, false)
        };
        check("title:toto", "Term([0, 116, 111, 116, 111])");
        check("+title:toto", "Term([0, 116, 111, 116, 111])");
        check("+title:toto -titi",
              "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))");
        assert_eq!(parse_query_to_logical_ast("-title:toto", false).err(),
                   Some(QueryParserError::AllButQueryForbidden));
        check("title:a b", "(Term([0, 97]) (Term([0, 98]) Term([1, 98])))");
        check("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"");
    }

    #[test]
    pub fn test_parse_query_to_ast_conjunction() {
        let check = |query: &str, expected: &str| {
            test_parse_query_to_logical_ast_helper(query, expected, true)
        };
        check("title:toto", "Term([0, 116, 111, 116, 111])");
        check("+title:toto", "Term([0, 116, 111, 116, 111])");
        check("+title:toto -titi",
              "(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, 105, 116, 105]) Term([1, 116, 105, 116, 105])))");
        assert_eq!(parse_query_to_logical_ast("-title:toto", true).err(),
                   Some(QueryParserError::AllButQueryForbidden));
        check("title:a b", "(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))");
        check("title:\"a b\"", "\"[Term([0, 97]), Term([0, 98])]\"");
    }
}
|