mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 09:00:42 +00:00
Merge branch 'master' of github.com:tantivy-search/tantivy
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
use query::Occur;
|
||||
use schema::Field;
|
||||
use schema::Term;
|
||||
use std::fmt;
|
||||
use std::ops::Bound;
|
||||
use schema::Type;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum LogicalLiteral {
|
||||
Term(Term),
|
||||
Phrase(Vec<Term>),
|
||||
Range { field: Field, value_type: Type, lower: Bound<Term>, upper: Bound<Term> },
|
||||
All,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -54,6 +59,8 @@ impl fmt::Debug for LogicalLiteral {
|
||||
match *self {
|
||||
LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term),
|
||||
LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms),
|
||||
LogicalLiteral::Range { ref lower, ref upper, .. } => write!(formatter, "({:?} TO {:?})", lower, upper),
|
||||
LogicalLiteral::All => write!(formatter, "*"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,29 +1,36 @@
|
||||
use super::user_input_ast::*;
|
||||
use combine::char::*;
|
||||
use combine::*;
|
||||
use query::query_parser::user_input_ast::UserInputBound;
|
||||
|
||||
fn field<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
|
||||
(letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
|
||||
fn word<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
|
||||
many1(satisfy(|c: char| c.is_alphanumeric()))
|
||||
}
|
||||
|
||||
|
||||
fn negative_number<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
|
||||
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
|
||||
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
let term_val = || {
|
||||
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
|
||||
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
|
||||
phrase.or(word)
|
||||
phrase.or(word())
|
||||
};
|
||||
|
||||
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let field = (
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let term_val_with_field = negative_numbers.or(term_val());
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
|
||||
let term_query =
|
||||
(field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
|
||||
(field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
});
|
||||
@@ -37,6 +44,26 @@ where
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
fn range<I: Stream<Item = char>>(input: I) -> ParseResult<UserInputAST, I> {
|
||||
let term_val = || {
|
||||
word().or(negative_number())
|
||||
};
|
||||
let lower_bound = {
|
||||
let excl = (char('{'), term_val()).map(|(_, w)| UserInputBound::Exclusive(w));
|
||||
let incl = (char('['), term_val()).map(|(_, w)| UserInputBound::Inclusive(w));
|
||||
excl.or(incl)
|
||||
};
|
||||
let upper_bound = {
|
||||
let excl = (term_val(), char('}')).map(|(w, _)| UserInputBound::Exclusive(w));
|
||||
let incl = (term_val(), char(']')).map(|(w, _)| UserInputBound::Inclusive(w));
|
||||
// TODO: this backtracking should be unnecessary
|
||||
try(excl).or(incl)
|
||||
};
|
||||
(optional((field(), char(':')).map(|x| x.0)), lower_bound, spaces(), string("TO"), spaces(), upper_bound)
|
||||
.map(|(field, lower, _, _, _, upper)| UserInputAST::Range { field, lower, upper })
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
@@ -45,6 +72,8 @@ where
|
||||
.map(|(_, expr)| UserInputAST::Not(Box::new(expr)))
|
||||
.or((char('+'), parser(leaf)).map(|(_, expr)| UserInputAST::Must(Box::new(expr))))
|
||||
.or((char('('), parser(parse_to_ast), char(')')).map(|(_, expr, _)| expr))
|
||||
.or(char('*').map(|_| UserInputAST::All))
|
||||
.or(try(parser(range)))
|
||||
.or(parser(literal))
|
||||
.parse_stream(input)
|
||||
}
|
||||
@@ -91,6 +120,10 @@ mod test {
|
||||
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,10 @@ use schema::{FieldType, Term};
|
||||
use std::num::ParseIntError;
|
||||
use std::str::FromStr;
|
||||
use tokenizer::TokenizerManager;
|
||||
use std::ops::Bound;
|
||||
use query::RangeQuery;
|
||||
use query::AllQuery;
|
||||
use std::borrow::Cow;
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
@@ -39,6 +43,9 @@ pub enum QueryParserError {
|
||||
/// The tokenizer for the given field is unknown
|
||||
/// The two argument strings are the name of the field, the name of the tokenizer
|
||||
UnknownTokenizer(String, String),
|
||||
/// The query contains a range query with a phrase as one of the bounds.
|
||||
/// Only terms can be used as bounds.
|
||||
RangeMustNotHavePhrase,
|
||||
}
|
||||
|
||||
impl From<ParseIntError> for QueryParserError {
|
||||
@@ -66,8 +73,8 @@ impl From<ParseIntError> for QueryParserError {
|
||||
/// by relevance : The user typically just scans through the first few
|
||||
/// documents in order of decreasing relevance and will stop when the documents
|
||||
/// are not relevant anymore.
|
||||
/// Making it possible to make this behavior customizable is tracked in
|
||||
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
|
||||
///
|
||||
/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`.
|
||||
///
|
||||
/// * negative terms: By prepending a term by a `-`, a term can be excluded
|
||||
/// from the search. This is useful for disambiguating a query.
|
||||
@@ -75,6 +82,17 @@ impl From<ParseIntError> for QueryParserError {
|
||||
///
|
||||
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
|
||||
///
|
||||
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed.
|
||||
/// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed
|
||||
/// by "obama".
|
||||
///
|
||||
/// * range terms: Range searches can be done by specifying the start and end bound. These can be
|
||||
/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains
|
||||
/// a word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
|
||||
/// Inclusive bounds are `[]`, exclusive are `{}`.
|
||||
///
|
||||
/// * all docs query: A plain `*` will match all documents in the index.
|
||||
///
|
||||
pub struct QueryParser {
|
||||
schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
@@ -155,11 +173,12 @@ impl QueryParser {
|
||||
}
|
||||
Ok(ast)
|
||||
}
|
||||
fn compute_logical_ast_for_leaf(
|
||||
|
||||
fn compute_terms_for_string(
|
||||
&self,
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
phrase: &str
|
||||
) -> Result<Vec<Term>, QueryParserError> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
if !field_type.is_indexed() {
|
||||
@@ -170,12 +189,12 @@ impl QueryParser {
|
||||
FieldType::I64(_) => {
|
||||
let val: i64 = i64::from_str(phrase)?;
|
||||
let term = Term::from_field_i64(field, val);
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
Ok(vec![term])
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
let val: u64 = u64::from_str(phrase)?;
|
||||
let term = Term::from_field_u64(field, val);
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
Ok(vec![term])
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
if let Some(option) = str_options.get_indexing_options() {
|
||||
@@ -194,17 +213,15 @@ impl QueryParser {
|
||||
terms.push(term);
|
||||
});
|
||||
if terms.is_empty() {
|
||||
Ok(None)
|
||||
Ok(vec![])
|
||||
} else if terms.len() == 1 {
|
||||
Ok(Some(LogicalLiteral::Term(
|
||||
terms.into_iter().next().unwrap(),
|
||||
)))
|
||||
Ok(terms)
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
if let Some(index_record_option) = field_type.get_index_record_option() {
|
||||
if index_record_option.has_positions() {
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
Ok(terms)
|
||||
} else {
|
||||
let fieldname = self.schema.get_field_name(field).to_string();
|
||||
Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
|
||||
@@ -224,8 +241,7 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet => {
|
||||
let term = Term::from_field_text(field, phrase);
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
Ok(vec![Term::from_field_text(field, phrase)])
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
let field_name = self.schema.get_field_name(field).to_string();
|
||||
@@ -234,6 +250,19 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_logical_ast_for_leaf(
|
||||
&self,
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
let terms = self.compute_terms_for_string(field, phrase)?;
|
||||
match terms.len() {
|
||||
0 => Ok(None),
|
||||
1 => Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap()))),
|
||||
_ => Ok(Some(LogicalLiteral::Phrase(terms))),
|
||||
}
|
||||
}
|
||||
|
||||
fn default_occur(&self) -> Occur {
|
||||
if self.conjunction_by_default {
|
||||
Occur::Must
|
||||
@@ -242,6 +271,31 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_bound(&self, field: Field, bound: &UserInputBound) -> Result<Bound<Term>, QueryParserError> {
|
||||
let terms = self.compute_terms_for_string(field, bound.term_str())?;
|
||||
if terms.len() != 1 {
|
||||
return Err(QueryParserError::RangeMustNotHavePhrase)
|
||||
}
|
||||
let term = terms.into_iter().next().unwrap();
|
||||
match *bound {
|
||||
UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
|
||||
UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
|
||||
}
|
||||
}
|
||||
|
||||
fn resolved_fields(&self, given_field: &Option<String>) -> Result<Cow<[Field]>, QueryParserError> {
|
||||
match *given_field {
|
||||
None => {
|
||||
if self.default_fields.is_empty() {
|
||||
Err(QueryParserError::NoDefaultFieldDeclared)
|
||||
} else {
|
||||
Ok(Cow::from(&self.default_fields[..]))
|
||||
}
|
||||
},
|
||||
Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_logical_ast_with_occur(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
@@ -265,6 +319,28 @@ impl QueryParser {
|
||||
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Range { field, lower, upper } => {
|
||||
let fields = self.resolved_fields(&field)?;
|
||||
let mut clauses = fields.iter().map(|&field| {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
lower: self.resolve_bound(field, &lower)?,
|
||||
upper: self.resolve_bound(field, &upper)?,
|
||||
})))
|
||||
}).collect::<Result<Vec<_>, QueryParserError>>()?;
|
||||
let result_ast = if clauses.len() == 1 {
|
||||
clauses.pop().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(clauses.into_iter().map(|clause| (Occur::Should, clause)).collect())
|
||||
};
|
||||
Ok((Occur::Should, result_ast))
|
||||
}
|
||||
UserInputAST::All => {
|
||||
Ok((Occur::Should, LogicalAST::Leaf(Box::new(LogicalLiteral::All))))
|
||||
}
|
||||
UserInputAST::Leaf(literal) => {
|
||||
let term_phrases: Vec<(Field, String)> = match literal.field_name {
|
||||
Some(ref field_name) => {
|
||||
@@ -327,6 +403,10 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
|
||||
match logical_literal {
|
||||
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
|
||||
LogicalLiteral::Phrase(terms) => Box::new(PhraseQuery::new(terms)),
|
||||
LogicalLiteral::Range { field, value_type, lower, upper } => {
|
||||
Box::new(RangeQuery::new_term_bounds(field, value_type, lower, upper))
|
||||
},
|
||||
LogicalLiteral::All => Box::new(AllQuery),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -511,6 +591,31 @@ mod test {
|
||||
Term([0, 0, 0, 0, 98])]\"",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:[a TO b]",
|
||||
"(Included(Term([0, 0, 0, 0, 97])) TO \
|
||||
Included(Term([0, 0, 0, 0, 98])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"[a TO b]",
|
||||
"((Included(Term([0, 0, 0, 0, 97])) TO \
|
||||
Included(Term([0, 0, 0, 0, 98]))) \
|
||||
(Included(Term([0, 0, 0, 1, 97])) TO \
|
||||
Included(Term([0, 0, 0, 1, 98]))))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{titi TO toto}",
|
||||
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
|
||||
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"*",
|
||||
"*",
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -14,10 +14,40 @@ impl fmt::Debug for UserInputLiteral {
|
||||
}
|
||||
}
|
||||
|
||||
/// One end of a user-supplied range, together with the text of its value.
pub enum UserInputBound {
    /// The bound value itself belongs to the range (written `[` / `]`).
    Inclusive(String),
    /// The bound value itself is left out of the range (written `{` / `}`).
    Exclusive(String),
}

impl UserInputBound {
    /// Writes this bound as the opening half of a range: the opening
    /// delimiter followed by the value between double quotes.
    fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        match self {
            UserInputBound::Inclusive(word) => write!(formatter, "[\"{}\"", word),
            UserInputBound::Exclusive(word) => write!(formatter, "{{\"{}\"", word),
        }
    }

    /// Writes this bound as the closing half of a range: the value between
    /// double quotes followed by the closing delimiter.
    fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        match self {
            UserInputBound::Inclusive(word) => write!(formatter, "\"{}\"]", word),
            UserInputBound::Exclusive(word) => write!(formatter, "\"{}\"}}", word),
        }
    }

    /// Returns the text of the bound value, whatever the kind of bound.
    pub fn term_str(&self) -> &str {
        match self {
            UserInputBound::Inclusive(contents) | UserInputBound::Exclusive(contents) => contents,
        }
    }
}
|
||||
|
||||
pub enum UserInputAST {
|
||||
Clause(Vec<Box<UserInputAST>>),
|
||||
Not(Box<UserInputAST>),
|
||||
Must(Box<UserInputAST>),
|
||||
Range { field: Option<String>, lower: UserInputBound, upper: UserInputBound },
|
||||
All,
|
||||
Leaf(Box<UserInputLiteral>),
|
||||
}
|
||||
|
||||
@@ -45,6 +75,16 @@ impl fmt::Debug for UserInputAST {
|
||||
Ok(())
|
||||
}
|
||||
UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery),
|
||||
UserInputAST::Range { ref field, ref lower, ref upper } => {
|
||||
if let &Some(ref field) = field {
|
||||
write!(formatter, "{}:", field)?;
|
||||
}
|
||||
lower.display_lower(formatter)?;
|
||||
write!(formatter, " TO ")?;
|
||||
upper.display_upper(formatter)?;
|
||||
Ok(())
|
||||
},
|
||||
UserInputAST::All => write!(formatter, "*"),
|
||||
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,6 +89,28 @@ pub struct RangeQuery {
|
||||
}
|
||||
|
||||
impl RangeQuery {
|
||||
/// Creates a new `RangeQuery` from bounded start and end terms.
|
||||
///
|
||||
/// If the value type is not correct, something may go terribly wrong when
|
||||
/// the `Weight` object is created.
|
||||
pub fn new_term_bounds(
|
||||
field: Field,
|
||||
value_type: Type,
|
||||
left_bound: Bound<Term>,
|
||||
right_bound: Bound<Term>
|
||||
) -> RangeQuery {
|
||||
let verify_and_unwrap_term = |val: &Term| {
|
||||
assert_eq!(field, val.field());
|
||||
val.value_bytes().to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type,
|
||||
left_bound: map_bound(&left_bound, &verify_and_unwrap_term),
|
||||
right_bound: map_bound(&right_bound, &verify_and_unwrap_term),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new `RangeQuery` over a `i64` field.
|
||||
///
|
||||
/// If the field is not of the type `i64`, tantivy
|
||||
|
||||
Reference in New Issue
Block a user