Merge branch 'master' of github.com:tantivy-search/tantivy

This commit is contained in:
Paul Masurel
2018-06-27 16:58:47 +09:00
5 changed files with 233 additions and 26 deletions

View File

@@ -1,11 +1,16 @@
use query::Occur;
use schema::Field;
use schema::Term;
use std::fmt;
use std::ops::Bound;
use schema::Type;
/// A leaf of the logical query AST, i.e. a literal that is later converted
/// into a concrete query object by `convert_literal_to_query`.
#[derive(Clone)]
pub enum LogicalLiteral {
/// A single term; converted into a `TermQuery`.
Term(Term),
/// A sequence of terms; converted into a `PhraseQuery`.
Phrase(Vec<Term>),
/// A range over `field` delimited by the `lower` and `upper` term bounds;
/// converted into a `RangeQuery` through `RangeQuery::new_term_bounds`.
/// Its `Debug` output is `(<lower> TO <upper>)`.
Range { field: Field, value_type: Type, lower: Bound<Term>, upper: Bound<Term> },
/// The "match everything" literal; converted into an `AllQuery` and
/// rendered as `*` by the `Debug` implementation.
All,
}
#[derive(Clone)]
@@ -54,6 +59,8 @@ impl fmt::Debug for LogicalLiteral {
match *self {
LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term),
LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms),
LogicalLiteral::Range { ref lower, ref upper, .. } => write!(formatter, "({:?} TO {:?})", lower, upper),
LogicalLiteral::All => write!(formatter, "*"),
}
}
}

View File

@@ -1,29 +1,36 @@
use super::user_input_ast::*;
use combine::char::*;
use combine::*;
use query::query_parser::user_input_ast::UserInputBound;
/// Parser for a field name: one letter followed by any number of
/// alphanumeric characters or underscores.
///
/// The output is the complete field name as a `String`.
fn field<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
    let head = letter();
    let tail = many(satisfy(|c: char| c.is_alphanumeric() || c == '_'));
    (head, tail).map(|(first, rest): (char, String)| {
        let mut name = String::with_capacity(first.len_utf8() + rest.len());
        name.push(first);
        name.push_str(&rest);
        name
    })
}
/// Parser for a bare word: one or more alphanumeric characters, collected
/// into a `String`.
fn word<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
    let is_word_char = |c: char| c.is_alphanumeric();
    many1(satisfy(is_word_char))
}
/// Parser for a negative number: a `-` sign followed by one or more numeric
/// characters.
///
/// The output keeps the leading `-`, e.g. the input `-12` yields `"-12"`.
fn negative_number<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
    let sign = char('-');
    let digits = many1(satisfy(|c: char| c.is_numeric()));
    (sign, digits).map(|(minus, magnitude): (char, String)| {
        let mut number = String::with_capacity(minus.len_utf8() + magnitude.len());
        number.push(minus);
        number.push_str(&magnitude);
        number
    })
}
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where
I: Stream<Item = char>,
{
let term_val = || {
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
phrase.or(word)
phrase.or(word())
};
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field = (
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_val_with_field = negative_number().or(term_val());
let term_query =
(field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
(field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
field_name: Some(field_name),
phrase,
});
@@ -37,6 +44,26 @@ where
.parse_stream(input)
}
/// Parses a range expression such as `foo:[1 TO 5]`, `[1 TO 5]` or
/// `foo:{a TO z}` into a `UserInputAST::Range`.
///
/// The `field:` prefix is optional. `[` / `]` delimit inclusive bounds,
/// `{` / `}` delimit exclusive bounds, and both styles can be mixed
/// (e.g. `foo:[1 TO toto}`). Each bound value is either a `word()` or a
/// `negative_number()`, and the two bounds are separated by the `TO` keyword
/// surrounded by optional whitespace.
fn range<I: Stream<Item = char>>(input: I) -> ParseResult<UserInputAST, I> {
    let bound_value = || word().or(negative_number());
    let field_prefix = optional((field(), char(':')).map(|(name, _)| name));

    let lower_bound = {
        let exclusive =
            (char('{'), bound_value()).map(|(_, value)| UserInputBound::Exclusive(value));
        let inclusive =
            (char('['), bound_value()).map(|(_, value)| UserInputBound::Inclusive(value));
        exclusive.or(inclusive)
    };

    let upper_bound = {
        let exclusive =
            (bound_value(), char('}')).map(|(value, _)| UserInputBound::Exclusive(value));
        let inclusive =
            (bound_value(), char(']')).map(|(value, _)| UserInputBound::Inclusive(value));
        // The exclusive alternative consumes the bound value before it can
        // notice that the closing delimiter is `]`, hence the backtracking.
        // TODO: this backtracking should be unnecessary
        try(exclusive).or(inclusive)
    };

    (
        field_prefix,
        lower_bound,
        spaces(),
        string("TO"),
        spaces(),
        upper_bound,
    ).map(|(field, lower, _, _, _, upper)| UserInputAST::Range { field, lower, upper })
        .parse_stream(input)
}
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where
I: Stream<Item = char>,
@@ -45,6 +72,8 @@ where
.map(|(_, expr)| UserInputAST::Not(Box::new(expr)))
.or((char('+'), parser(leaf)).map(|(_, expr)| UserInputAST::Must(Box::new(expr))))
.or((char('('), parser(parse_to_ast), char(')')).map(|(_, expr, _)| expr))
.or(char('*').map(|_| UserInputAST::All))
.or(try(parser(range)))
.or(parser(literal))
.parse_stream(input)
}
@@ -91,6 +120,10 @@ mod test {
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
test_is_parse_err("abc + ");
}
}

View File

@@ -13,6 +13,10 @@ use schema::{FieldType, Term};
use std::num::ParseIntError;
use std::str::FromStr;
use tokenizer::TokenizerManager;
use std::ops::Bound;
use query::RangeQuery;
use query::AllQuery;
use std::borrow::Cow;
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq)]
@@ -39,6 +43,9 @@ pub enum QueryParserError {
/// The tokenizer for the given field is unknown
/// The two argument strings are the name of the field, the name of the tokenizer
UnknownTokenizer(String, String),
/// The query contains a range query with a phrase as one of the bounds.
/// Only terms can be used as bounds.
RangeMustNotHavePhrase,
}
impl From<ParseIntError> for QueryParserError {
@@ -66,8 +73,8 @@ impl From<ParseIntError> for QueryParserError {
/// by relevance : The user typically just scans through the first few
/// documents in order of decreasing relevance and will stop when the documents
/// are not relevant anymore.
/// Making it possible to make this behavior customizable is tracked in
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
///
/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`.
///
/// * negative terms: By prepending a term by a `-`, a term can be excluded
/// from the search. This is useful for disambiguating a query.
@@ -75,6 +82,17 @@ impl From<ParseIntError> for QueryParserError {
///
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
///
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed.
/// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed
/// by "obama".
///
/// * range terms: Range searches can be done by specifying the start and end bound. These can be
/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains
/// a word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
/// Inclusive bounds are `[]`, exclusive are `{}`.
///
/// * all docs query: A plain `*` will match all documents in the index.
///
pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
@@ -155,11 +173,12 @@ impl QueryParser {
}
Ok(ast)
}
fn compute_logical_ast_for_leaf(
fn compute_terms_for_string(
&self,
field: Field,
phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
phrase: &str
) -> Result<Vec<Term>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
if !field_type.is_indexed() {
@@ -170,12 +189,12 @@ impl QueryParser {
FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
let term = Term::from_field_i64(field, val);
Ok(Some(LogicalLiteral::Term(term)))
Ok(vec![term])
}
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
let term = Term::from_field_u64(field, val);
Ok(Some(LogicalLiteral::Term(term)))
Ok(vec![term])
}
FieldType::Str(ref str_options) => {
if let Some(option) = str_options.get_indexing_options() {
@@ -194,17 +213,15 @@ impl QueryParser {
terms.push(term);
});
if terms.is_empty() {
Ok(None)
Ok(vec![])
} else if terms.len() == 1 {
Ok(Some(LogicalLiteral::Term(
terms.into_iter().next().unwrap(),
)))
Ok(terms)
} else {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
if let Some(index_record_option) = field_type.get_index_record_option() {
if index_record_option.has_positions() {
Ok(Some(LogicalLiteral::Phrase(terms)))
Ok(terms)
} else {
let fieldname = self.schema.get_field_name(field).to_string();
Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
@@ -224,8 +241,7 @@ impl QueryParser {
}
}
FieldType::HierarchicalFacet => {
let term = Term::from_field_text(field, phrase);
Ok(Some(LogicalLiteral::Term(term)))
Ok(vec![Term::from_field_text(field, phrase)])
}
FieldType::Bytes => {
let field_name = self.schema.get_field_name(field).to_string();
@@ -234,6 +250,19 @@ impl QueryParser {
}
}
/// Converts `phrase` into the logical literal to search for in `field`.
///
/// The terms are produced by `compute_terms_for_string`, whose errors are
/// propagated unchanged. Depending on how many terms come back:
///
/// * no term: `Ok(None)`,
/// * exactly one term: `Ok(Some(LogicalLiteral::Term(..)))`,
/// * several terms: `Ok(Some(LogicalLiteral::Phrase(..)))`.
fn compute_logical_ast_for_leaf(
    &self,
    field: Field,
    phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
    let mut terms = self.compute_terms_for_string(field, phrase)?;
    let literal = if terms.len() > 1 {
        Some(LogicalLiteral::Phrase(terms))
    } else {
        // Zero or one term left: `pop` yields `None` or the only term.
        terms.pop().map(LogicalLiteral::Term)
    };
    Ok(literal)
}
fn default_occur(&self) -> Occur {
if self.conjunction_by_default {
Occur::Must
@@ -242,6 +271,31 @@ impl QueryParser {
}
}
/// Turns a user-supplied range bound into a `Bound<Term>` for `field`.
///
/// The bound text goes through `compute_terms_for_string`; any error it
/// returns is propagated. The text must resolve to exactly one term,
/// otherwise `QueryParserError::RangeMustNotHavePhrase` is returned.
/// An inclusive user bound maps to `Bound::Included`, an exclusive one to
/// `Bound::Excluded`.
fn resolve_bound(
    &self,
    field: Field,
    bound: &UserInputBound,
) -> Result<Bound<Term>, QueryParserError> {
    let mut terms = self.compute_terms_for_string(field, bound.term_str())?;
    // Exactly one term means: popping yields a term and nothing is left.
    let term = match (terms.pop(), terms.is_empty()) {
        (Some(term), true) => term,
        _ => return Err(QueryParserError::RangeMustNotHavePhrase),
    };
    Ok(match *bound {
        UserInputBound::Inclusive(_) => Bound::Included(term),
        UserInputBound::Exclusive(_) => Bound::Excluded(term),
    })
}
/// Returns the fields a query element applies to.
///
/// * With an explicit field name, the name is resolved through
///   `resolve_field_name` (errors are propagated) and a one-element owned
///   list is returned.
/// * Without one, the parser's default fields are borrowed; if there are
///   none, `QueryParserError::NoDefaultFieldDeclared` is returned.
fn resolved_fields(
    &self,
    given_field: &Option<String>,
) -> Result<Cow<[Field]>, QueryParserError> {
    if let Some(ref field_name) = *given_field {
        let field = self.resolve_field_name(&*field_name)?;
        return Ok(Cow::from(vec![field]));
    }
    if self.default_fields.is_empty() {
        return Err(QueryParserError::NoDefaultFieldDeclared);
    }
    Ok(Cow::from(&self.default_fields[..]))
}
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAST,
@@ -265,6 +319,28 @@ impl QueryParser {
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
}
UserInputAST::Range { field, lower, upper } => {
let fields = self.resolved_fields(&field)?;
let mut clauses = fields.iter().map(|&field| {
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
field,
value_type,
lower: self.resolve_bound(field, &lower)?,
upper: self.resolve_bound(field, &upper)?,
})))
}).collect::<Result<Vec<_>, QueryParserError>>()?;
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
LogicalAST::Clause(clauses.into_iter().map(|clause| (Occur::Should, clause)).collect())
};
Ok((Occur::Should, result_ast))
}
UserInputAST::All => {
Ok((Occur::Should, LogicalAST::Leaf(Box::new(LogicalLiteral::All))))
}
UserInputAST::Leaf(literal) => {
let term_phrases: Vec<(Field, String)> = match literal.field_name {
Some(ref field_name) => {
@@ -327,6 +403,10 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
match logical_literal {
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
LogicalLiteral::Phrase(terms) => Box::new(PhraseQuery::new(terms)),
LogicalLiteral::Range { field, value_type, lower, upper } => {
Box::new(RangeQuery::new_term_bounds(field, value_type, lower, upper))
},
LogicalLiteral::All => Box::new(AllQuery),
}
}
@@ -511,6 +591,31 @@ mod test {
Term([0, 0, 0, 0, 98])]\"",
false,
);
test_parse_query_to_logical_ast_helper(
"title:[a TO b]",
"(Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term([0, 0, 0, 0, 98])))",
false,
);
test_parse_query_to_logical_ast_helper(
"[a TO b]",
"((Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term([0, 0, 0, 0, 98]))) \
(Included(Term([0, 0, 0, 1, 97])) TO \
Included(Term([0, 0, 0, 1, 98]))))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}",
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
false,
);
test_parse_query_to_logical_ast_helper(
"*",
"*",
false,
);
}
#[test]

View File

@@ -14,10 +14,40 @@ impl fmt::Debug for UserInputLiteral {
}
}
/// One end of a range as written by the user, before it is resolved
/// against the schema.
pub enum UserInputBound {
    /// The bound value itself belongs to the range.
    Inclusive(String),
    /// The bound value itself is left out of the range.
    Exclusive(String),
}

impl UserInputBound {
    /// Writes the bound as the lower end of a range: the opening delimiter
    /// (`[` when inclusive, `{` when exclusive) followed by the quoted value.
    fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        let opening = match *self {
            UserInputBound::Inclusive(_) => '[',
            UserInputBound::Exclusive(_) => '{',
        };
        write!(formatter, "{}\"{}\"", opening, self.term_str())
    }

    /// Writes the bound as the upper end of a range: the quoted value
    /// followed by the closing delimiter (`]` when inclusive, `}` when
    /// exclusive).
    fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        let closing = match *self {
            UserInputBound::Inclusive(_) => ']',
            UserInputBound::Exclusive(_) => '}',
        };
        write!(formatter, "\"{}\"{}", self.term_str(), closing)
    }

    /// Returns the raw text of the bound, regardless of its inclusiveness.
    pub fn term_str(&self) -> &str {
        match *self {
            UserInputBound::Inclusive(ref text) | UserInputBound::Exclusive(ref text) => text,
        }
    }
}
/// Syntax tree of a query as typed by the user, before any field or term
/// is resolved against the schema.
pub enum UserInputAST {
/// A list of sub-expressions.
Clause(Vec<Box<UserInputAST>>),
/// A negated sub-expression; its `Debug` output is `-(<subquery>)`.
Not(Box<UserInputAST>),
/// A required sub-expression (the parser builds it from a `+` prefix).
Must(Box<UserInputAST>),
/// A range with an optional field name and a lower and an upper bound,
/// e.g. `foo:[1 TO 5]`.
Range { field: Option<String>, lower: UserInputBound, upper: UserInputBound },
/// The `*` query.
All,
/// A single literal.
Leaf(Box<UserInputLiteral>),
}
@@ -45,6 +75,16 @@ impl fmt::Debug for UserInputAST {
Ok(())
}
UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery),
UserInputAST::Range { ref field, ref lower, ref upper } => {
if let &Some(ref field) = field {
write!(formatter, "{}:", field)?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
upper.display_upper(formatter)?;
Ok(())
},
UserInputAST::All => write!(formatter, "*"),
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
}
}

View File

@@ -89,6 +89,28 @@ pub struct RangeQuery {
}
impl RangeQuery {
/// Creates a new `RangeQuery` from bounded start and end terms.
///
/// Each bound is converted with `map_bound`, which keeps only the value
/// bytes of the term it is given.
///
/// If the value type is not correct, something may go terribly wrong when
/// the `Weight` object is created.
///
/// # Panics
///
/// Panics if a term handed to the conversion closure does not belong to
/// `field`.
pub fn new_term_bounds(
    field: Field,
    value_type: Type,
    left_bound: Bound<Term>,
    right_bound: Bound<Term>,
) -> RangeQuery {
    // Checks that the term targets `field`, then extracts its value bytes.
    let term_bytes = |term: &Term| {
        assert_eq!(field, term.field());
        term.value_bytes().to_owned()
    };
    let left_bound = map_bound(&left_bound, &term_bytes);
    let right_bound = map_bound(&right_bound, &term_bytes);
    RangeQuery {
        field,
        value_type,
        left_bound,
        right_bound,
    }
}
/// Creates a new `RangeQuery` over a `i64` field.
///
/// If the field is not of the type `i64`, tantivy