// tantivy/src/query/query_parser/query_parser.rs
use std::collections::HashMap;
use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};
use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use thiserror::Error;
use super::logical_ast::*;
use crate::core::Index;
use crate::indexer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
Schema, Term, Type,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTime, Score};
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq, Error)]
pub enum QueryParserError {
/// Error in the query syntax
#[error("Syntax Error: {0}")]
SyntaxError(String),
/// This query is unsupported.
#[error("Unsupported query: {0}")]
UnsupportedQuery(String),
/// The query references a field that is not in the schema
#[error("Field does not exists: '{0}'")]
FieldDoesNotExist(String),
/// The query contains a term for a `u64` or `i64`-field, but the value
/// is neither.
#[error("Expected a valid integer: '{0:?}'")]
ExpectedInt(#[from] ParseIntError),
/// The query contains a term for a bytes field, but the value is not valid
/// base64.
#[error("Expected base64: '{0:?}'")]
ExpectedBase64(#[from] base64::DecodeError),
/// The query contains a term for a `f64`-field, but the value
/// is not a f64.
#[error("Expected a float value: '{0:?}'")]
ExpectedFloat(#[from] ParseFloatError),
/// The query contains a term for a bool field, but the value
/// is not a bool.
#[error("Expected a bool value: '{0:?}'")]
ExpectedBool(#[from] ParseBoolError),
/// Queries that are only "excluding" (e.g. `-title:pop`) are forbidden.
#[error("Invalid query: Only excluding terms given")]
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbidden.
#[error("No default field declared and no field specified in query")]
NoDefaultFieldDeclared,
/// The field searched for is not declared
/// as indexed in the schema.
#[error("The field '{0}' is not declared as indexed")]
FieldNotIndexed(String),
/// A phrase query was requested for a field that does not
/// have any positions indexed.
#[error("The field '{0}' does not have positions indexed")]
FieldDoesNotHavePositionsIndexed(String),
/// The tokenizer for the given field is unknown.
/// The two argument strings are the name of the field and the name of the tokenizer.
#[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")]
UnknownTokenizer {
/// The name of the tokenizer
tokenizer: String,
/// The field name
field: String,
},
/// The query contains a range query with a phrase as one of the bounds.
/// Only terms can be used as bounds.
#[error("A range query cannot have a phrase as one of the bounds")]
RangeMustNotHavePhrase,
/// The format for the date field is not RFC 3339 compliant.
#[error("The date field has an invalid format")]
DateFormatError(#[from] time::error::Parse),
/// The format for the facet field is invalid.
#[error("The facet field is malformed: {0}")]
FacetFormatError(#[from] FacetParseError),
/// The format for the ip field is invalid.
#[error("The ip field is malformed: {0}")]
IpFormatError(#[from] AddrParseError),
}
/// Recursively removes empty clauses from the AST.
///
/// Returns `None` if and only if the `logical_ast` ended up being empty.
fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
match logical_ast {
LogicalAst::Clause(children) => {
let trimmed_children = children
.into_iter()
.flat_map(|(occur, child)| {
trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
})
.collect::<Vec<_>>();
if trimmed_children.is_empty() {
None
} else {
Some(LogicalAst::Clause(trimmed_children))
}
}
_ => Some(logical_ast),
}
}
/// Tantivy's Query parser
///
/// The language covered by the current parser is extremely simple.
///
/// * simple terms: e.g. `Barack Obama` is simply tokenized using tantivy's
/// [`SimpleTokenizer`](crate::tokenizer::SimpleTokenizer), hence becoming `["barack", "obama"]`.
/// The terms are then searched within the default fields of the query parser.
///
/// e.g. If `body` and `title` are default fields, our example terms are
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
/// By default, all tokenized and indexed fields are default fields.
///
/// Multiple terms are handled as an `OR`: any document containing at least
/// one of the terms will go through the scoring.
///
/// This behavior is slower, but is not a bad idea if the user is sorting
/// by relevance: the user typically just scans through the first few
/// documents in order of decreasing relevance and will stop when the documents
/// are not relevant anymore.
///
/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`.
///
///
/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
/// interpreted as `(a AND b) OR c`.
///
/// * In addition to the boolean operators, the `-` and `+` operators can be used. These operators
/// are sufficient to express all queries using boolean operators. For instance `x AND y OR z` can
/// be written as (`(+x +y) z`). In addition, these operators can help define "required optional"
/// queries. `(+x y)` matches the same document set as simply `x`, but `y` will help refine the
/// score.
///
/// * negative terms: By prefixing a term with `-`, a term can be excluded from the search. This is
/// useful for disambiguating a query. e.g. `apple -fruit`
///
/// * must terms: By prefixing a term with `+`, a term can be made required for the search.
///
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. e.g.,
/// `title:"Barack Obama"` will only find documents that have "barack" immediately followed by
/// "obama".
///
/// * range terms: Range searches can be done by specifying the start and end bound. These can be
/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains a
/// word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
/// Inclusive bounds are `[]`, exclusive are `{}`.
///
/// * date values: The query parser supports RFC 3339 formatted dates. For example
/// `"2002-10-02T15:00:00.05Z"` or `some_date_field:[2002-10-02T15:00:00Z TO
/// 2002-10-02T18:00:00Z}`
///
/// * all docs query: A plain `*` will match all documents in the index.
///
/// Parts of the queries can be boosted by appending `^boostfactor`.
/// For instance, `"SRE"^2.0 OR devops^0.4` will weight documents containing `SRE`
/// higher than ones containing `devops`. Negative boosts are not allowed.
///
/// It is also possible to define a boost for a specific field at the query parser level
/// (see [`set_field_boost(...)`](QueryParser::set_field_boost)). Typically you may want to boost a
/// title field.
///
/// Phrase terms support the `~` slop operator, which sets the phrase's matching
/// distance in words. `"big wolf"~1` will match documents containing the phrase `"big bad wolf"`.
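///
/// # Example
///
/// A minimal sketch of building a parser and parsing a query. The schema and
/// field names below are purely illustrative:
///
/// ```rust
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::Index;
///
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let body = schema_builder.add_text_field("body", TEXT);
/// let index = Index::create_in_ram(schema_builder.build());
///
/// // Terms without an explicit field target both default fields.
/// let query_parser = QueryParser::for_index(&index, vec![title, body]);
/// let query = query_parser.parse_query("title:barack +obama").unwrap();
/// ```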
#[derive(Clone)]
pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
conjunction_by_default: bool,
tokenizer_manager: TokenizerManager,
boost: HashMap<Field, Score>,
}
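/// Returns true if the query is built exclusively out of `MustNot` clauses,
/// i.e. a purely "excluding" query, which is forbidden.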
fn all_negative(ast: &LogicalAst) -> bool {
match ast {
LogicalAst::Leaf(_) => false,
LogicalAst::Boost(ref child_ast, _) => all_negative(child_ast),
LogicalAst::Clause(children) => children
.iter()
.all(|(ref occur, child)| (*occur == Occur::MustNot) || all_negative(child)),
}
}
impl QueryParser {
/// Creates a `QueryParser`, given
/// * schema - index Schema
/// * default_fields - fields used to search if no field is specifically defined in the query.
pub fn new(
schema: Schema,
default_fields: Vec<Field>,
tokenizer_manager: TokenizerManager,
) -> QueryParser {
QueryParser {
schema,
default_fields,
tokenizer_manager,
conjunction_by_default: false,
boost: Default::default(),
}
}
// Splits a full_path as written in a query, into a field name and a
// json path.
pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
self.schema.find_field(full_path)
}
/// Creates a `QueryParser`, given
/// * an index
/// * a set of default fields used to search if no field is specifically defined
/// in the query.
pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
}
/// Set the default way to compose queries to a conjunction.
///
/// By default, the query `happy tax payer` is equivalent to the query
/// `happy OR tax OR payer`. After calling `.set_conjunction_by_default()`
/// `happy tax payer` will be interpreted by the parser as `happy AND tax AND payer`.
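///
/// A short sketch (`query_parser` here is any hypothetical mutable parser):
///
/// ```ignore
/// query_parser.set_conjunction_by_default();
/// // `happy tax payer` now parses as `happy AND tax AND payer`.
/// let query = query_parser.parse_query("happy tax payer").unwrap();
/// ```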
pub fn set_conjunction_by_default(&mut self) {
self.conjunction_by_default = true;
}
/// Sets a boost for a specific field.
///
/// The parsed query will automatically boost this field.
///
/// If the query defines a boost through the query language (e.g. `country:France^3.0`),
/// the two boosts (the one defined in the query, and the one defined in the `QueryParser`)
/// are multiplied together.
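///
/// A short sketch (the `title_field` handle is hypothetical):
///
/// ```ignore
/// // Matches in the title field now weigh twice as much in the final score.
/// query_parser.set_field_boost(title_field, 2.0);
/// ```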
pub fn set_field_boost(&mut self, field: Field, boost: Score) {
self.boost.insert(field, boost);
}
/// Parse a query
///
/// Note that `parse_query` returns an error if the input
/// is not a valid query.
///
/// There is currently no lenient mode for the query parser,
/// which makes it a poor choice for a public-facing search engine.
///
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<dyn Query>, QueryParserError> {
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(logical_ast))
}
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
let user_input_ast = tantivy_query_grammar::parse_query(query)
.map_err(|_| QueryParserError::SyntaxError(query.to_string()))?;
self.compute_logical_ast(user_input_ast)
}
fn compute_logical_ast(
&self,
user_input_ast: UserInputAst,
) -> Result<LogicalAst, QueryParserError> {
let ast = self.compute_logical_ast_with_occur(user_input_ast)?;
if let LogicalAst::Clause(children) = &ast {
if children.is_empty() {
return Ok(ast);
}
}
if all_negative(&ast) {
return Err(QueryParserError::AllButQueryForbidden);
}
Ok(ast)
}
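/// Converts a boundary value of a range/set query into a `Term` for the given
/// field, checking first that the field is indexed (or is a fast ip field)
/// and that a json path is only used on json fields.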
fn compute_boundary_term(
&self,
field: Field,
json_path: &str,
phrase: &str,
) -> Result<Term, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let is_ip_and_fast = field_type.is_ip_addr() && field_type.is_fast();
if !field_type.is_indexed() && !is_ip_and_fast {
return Err(QueryParserError::FieldNotIndexed(
field_entry.name().to_string(),
));
}
if !json_path.is_empty() && field_type.value_type() != Type::Json {
return Err(QueryParserError::UnsupportedQuery(format!(
"Json path is not supported for field {:?}",
field_entry.name()
)));
}
match *field_type {
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
Ok(Term::from_field_u64(field, val))
}
FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
Ok(Term::from_field_i64(field, val))
}
FieldType::F64(_) => {
let val: f64 = f64::from_str(phrase)?;
Ok(Term::from_field_f64(field, val))
}
FieldType::Bool(_) => {
let val: bool = bool::from_str(phrase)?;
Ok(Term::from_field_bool(field, val))
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
}
FieldType::Str(ref str_options) => {
let option = str_options.get_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
})?;
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_entry.name().to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let mut terms: Vec<Term> = Vec::new();
let mut token_stream = text_analyzer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.text);
terms.push(term);
});
if terms.len() != 1 {
return Err(QueryParserError::UnsupportedQuery(format!(
"Range query boundary cannot have multiple tokens: {phrase:?}."
)));
}
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(_) => {
// Json ranges are not supported.
Err(QueryParserError::UnsupportedQuery(
"Range queries are not supported on json fields.".to_string(),
))
}
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => Ok(Term::from_facet(field, &facet)),
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
Ok(Term::from_field_bytes(field, &bytes))
}
FieldType::IpAddr(_) => {
let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
Ok(Term::from_field_ip_addr(field, ip_v6))
}
}
}
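/// Computes the logical literals for a single (field, json_path, phrase)
/// triplet, dispatching on the field type.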
fn compute_logical_ast_for_leaf(
&self,
field: Field,
json_path: &str,
phrase: &str,
slop: u32,
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let field_name = field_entry.name();
if !field_type.is_indexed() {
return Err(QueryParserError::FieldNotIndexed(field_name.to_string()));
}
if field_type.value_type() != Type::Json && !json_path.is_empty() {
let field_name = self.schema.get_field_name(field);
return Err(QueryParserError::FieldDoesNotExist(format!(
"{field_name}.{json_path}"
)));
}
match *field_type {
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
let i64_term = Term::from_field_u64(field, val);
Ok(vec![LogicalLiteral::Term(i64_term)])
}
FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
let i64_term = Term::from_field_i64(field, val);
Ok(vec![LogicalLiteral::Term(i64_term)])
}
FieldType::F64(_) => {
let val: f64 = f64::from_str(phrase)?;
let f64_term = Term::from_field_f64(field, val);
Ok(vec![LogicalLiteral::Term(f64_term)])
}
FieldType::Bool(_) => {
let val: bool = bool::from_str(phrase)?;
let bool_term = Term::from_field_bool(field, val);
Ok(vec![LogicalLiteral::Term(bool_term)])
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
Ok(vec![LogicalLiteral::Term(dt_term)])
}
FieldType::Str(ref str_options) => {
let option = str_options.get_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let index_record_option = option.index_option();
Ok(generate_literals_for_str(
field_name,
field,
phrase,
slop,
&text_analyzer,
index_record_option,
)?
.into_iter()
.collect())
}
FieldType::JsonObject(ref json_options) => generate_literals_for_json_object(
field_name,
field,
json_path,
phrase,
&self.tokenizer_manager,
json_options,
),
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => {
let facet_term = Term::from_facet(field, &facet);
Ok(vec![LogicalLiteral::Term(facet_term)])
}
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
let bytes_term = Term::from_field_bytes(field, &bytes);
Ok(vec![LogicalLiteral::Term(bytes_term)])
}
FieldType::IpAddr(_) => {
let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
let term = Term::from_field_ip_addr(field, ip_v6);
Ok(vec![LogicalLiteral::Term(term)])
}
}
}
fn default_occur(&self) -> Occur {
if self.conjunction_by_default {
Occur::Must
} else {
Occur::Should
}
}
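/// Resolves a user-supplied range bound into a `Bound<Term>`.
/// A bare `*` is interpreted as an unbounded side.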
fn resolve_bound(
&self,
field: Field,
json_path: &str,
bound: &UserInputBound,
) -> Result<Bound<Term>, QueryParserError> {
if bound.term_str() == "*" {
return Ok(Bound::Unbounded);
}
let term = self.compute_boundary_term(field, json_path, bound.term_str())?;
match *bound {
UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
UserInputBound::Unbounded => Ok(Bound::Unbounded),
}
}
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAst,
) -> Result<LogicalAst, QueryParserError> {
match user_input_ast {
UserInputAst::Clause(sub_queries) => {
let default_occur = self.default_occur();
let mut logical_sub_queries: Vec<(Occur, LogicalAst)> = Vec::new();
for (occur_opt, sub_ast) in sub_queries {
let sub_ast = self.compute_logical_ast_with_occur(sub_ast)?;
let occur = occur_opt.unwrap_or(default_occur);
logical_sub_queries.push((occur, sub_ast));
}
Ok(LogicalAst::Clause(logical_sub_queries))
}
UserInputAst::Boost(ast, boost) => {
let ast = self.compute_logical_ast_with_occur(*ast)?;
Ok(ast.boost(boost as Score))
}
UserInputAst::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf),
}
}
fn field_boost(&self, field: Field) -> Score {
self.boost.get(&field).cloned().unwrap_or(1.0)
}
fn default_indexed_json_fields(&self) -> impl Iterator<Item = Field> + '_ {
let schema = self.schema.clone();
self.default_fields.iter().cloned().filter(move |field| {
let field_type = schema.get_field_entry(*field).field_type();
field_type.value_type() == Type::Json && field_type.is_indexed()
})
}
/// Given a literal, returns the list of terms that should be searched.
///
/// The terms are identified by a triplet:
/// - tantivy field
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
/// object by naturally extending the json field name with a "."-separated field_path.
/// - field_phrase: the phrase that is being searched.
///
/// The literal identifies the targeted field by a so-called *full field path*,
/// specified before the ":". (e.g. identity.username:fulmicoton).
///
/// The way we split the full field path into (field_name, field_path) can be ambiguous,
/// because field names can contain "." themselves.
/// For instance, if a field is named `one.two` and another one is named `one`,
/// should `one.two:three` target `one.two` with field path `` or `one` with
/// the field path `two`?
///
/// In this case, tantivy just picks the solution with the longest field name.
///
/// Quirk: As a hack for quickwit, we do not split over a dot that appears escaped ('\.').
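///
/// A short worked example of the rule (fields here are hypothetical): with
/// `one` and `one.two` both declared in the schema, `one.two.three:val`
/// resolves to field `one.two` with field path `three`, not to field `one`
/// with field path `two.three`, because `one.two` is the longer field name.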
fn compute_path_triplets_for_literal<'a>(
&self,
literal: &'a UserInputLiteral,
) -> Result<Vec<(Field, &'a str, &'a str)>, QueryParserError> {
let full_path = if let Some(full_path) = &literal.field_name {
full_path
} else {
// The user did not specify any path...
// We simply target default fields.
if self.default_fields.is_empty() {
return Err(QueryParserError::NoDefaultFieldDeclared);
}
return Ok(self
.default_fields
.iter()
.map(|default_field| (*default_field, "", literal.phrase.as_str()))
.collect::<Vec<(Field, &str, &str)>>());
};
if let Some((field, path)) = self.split_full_path(full_path) {
return Ok(vec![(field, path, literal.phrase.as_str())]);
}
// We need to add terms associated with json default fields.
let triplets: Vec<(Field, &str, &str)> = self
.default_indexed_json_fields()
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
.collect();
if triplets.is_empty() {
return Err(QueryParserError::FieldDoesNotExist(full_path.to_string()));
}
Ok(triplets)
}
fn compute_logical_ast_from_leaf(
&self,
leaf: UserInputLeaf,
) -> Result<LogicalAst, QueryParserError> {
match leaf {
UserInputLeaf::Literal(literal) => {
let term_phrases: Vec<(Field, &str, &str)> =
self.compute_path_triplets_for_literal(&literal)?;
let mut asts: Vec<LogicalAst> = Vec::new();
for (field, json_path, phrase) in term_phrases {
for ast in
self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)?
{
// Apply the field-specific boost defined at the query parser level.
let boost = self.field_boost(field);
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
}
}
let result_ast: LogicalAst = if asts.len() == 1 {
asts.into_iter().next().unwrap()
} else {
LogicalAst::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
Ok(result_ast)
}
UserInputLeaf::All => Ok(LogicalAst::Leaf(Box::new(LogicalLiteral::All))),
UserInputLeaf::Range {
field: full_field_opt,
lower,
upper,
} => {
let full_path = full_field_opt.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Range query need to target a specific field.".to_string(),
)
})?;
let (field, json_path) = self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
field,
value_type,
lower: self.resolve_bound(field, json_path, &lower)?,
upper: self.resolve_bound(field, json_path, &upper)?,
}));
Ok(logical_ast)
}
UserInputLeaf::Set {
field: full_field_opt,
elements,
} => {
let full_path = full_field_opt.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Set query need to target a specific field.".to_string(),
)
})?;
let (field, json_path) = self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements: elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.collect::<Result<Vec<_>, _>>()?,
field,
value_type,
}));
Ok(logical_ast)
}
}
}
}
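/// Lowers a single logical literal into its concrete `Query` implementation.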
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
match logical_literal {
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
),
LogicalLiteral::Range {
field,
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}
}
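/// Tokenizes `phrase` and produces the logical literal for a `Str` field:
/// a single term, a phrase (which requires positions to be indexed), or
/// `None` if the tokenizer produced no token (e.g. only stop words).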
fn generate_literals_for_str(
field_name: &str,
field: Field,
phrase: &str,
slop: u32,
text_analyzer: &TextAnalyzer,
index_record_option: IndexRecordOption,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let mut terms: Vec<(usize, Term)> = Vec::new();
let mut token_stream = text_analyzer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.text);
terms.push((token.position, term));
});
if terms.len() <= 1 {
let term_literal_opt = terms
.into_iter()
.next()
.map(|(_, term)| LogicalLiteral::Term(term));
return Ok(term_literal_opt);
}
if !index_record_option.has_positions() {
return Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
field_name.to_string(),
));
}
Ok(Some(LogicalLiteral::Phrase(terms, slop)))
}
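/// Produces the logical literals for a term targeting a json field: a
/// fast-value term if the phrase parses as a number/bool/date, plus the
/// term or phrase produced by tokenizing the phrase as text.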
fn generate_literals_for_json_object(
field_name: &str,
field: Field,
json_path: &str,
phrase: &str,
tokenizer_manager: &TokenizerManager,
json_options: &JsonObjectOptions,
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let text_options = json_options.get_text_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer = tokenizer_manager
.get(text_options.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: text_options.tokenizer().to_string(),
})?;
let index_record_option = text_options.index_option();
let mut logical_literals = Vec::new();
let mut term = Term::with_capacity(100);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
&mut term,
);
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
logical_literals.push(LogicalLiteral::Term(term));
}
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
drop(json_term_writer);
if terms.len() <= 1 {
for (_, term) in terms {
logical_literals.push(LogicalLiteral::Term(term));
}
return Ok(logical_literals);
}
if !index_record_option.has_positions() {
return Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
field_name.to_string(),
));
}
logical_literals.push(LogicalLiteral::Phrase(terms, 0));
Ok(logical_literals)
}
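/// Trims empty clauses out of the logical AST and lowers the result into a
/// `Query` tree, falling back to `EmptyQuery` if nothing remains.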
fn convert_to_query(logical_ast: LogicalAst) -> Box<dyn Query> {
match trim_ast(logical_ast) {
Some(LogicalAst::Clause(trimmed_clause)) => {
let occur_subqueries = trimmed_clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
assert!(
!occur_subqueries.is_empty(),
"Should not be empty after trimming"
);
Box::new(BooleanQuery::new(occur_subqueries))
}
Some(LogicalAst::Leaf(trimmed_logical_literal)) => {
convert_literal_to_query(*trimmed_logical_literal)
}
Some(LogicalAst::Boost(ast, boost)) => {
let query = convert_to_query(*ast);
let boosted_query = BoostQuery::new(query, boost);
Box::new(boosted_query)
}
None => Box::new(EmptyQuery),
}
}
#[cfg(test)]
mod test {
use matches::assert_matches;
use super::super::logical_ast::*;
use super::{QueryParser, QueryParserError};
use crate::query::Query;
use crate::schema::{
FacetOptions, Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions,
INDEXED, STORED, STRING, TEXT,
};
use crate::tokenizer::{
LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
};
use crate::Index;
fn make_schema() -> Schema {
let mut schema_builder = Schema::builder();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("en_with_stop_words")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("text", TEXT);
schema_builder.add_i64_field("signed", INDEXED);
schema_builder.add_u64_field("unsigned", INDEXED);
schema_builder.add_text_field("notindexed_text", STORED);
schema_builder.add_text_field("notindexed_u64", STORED);
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
schema_builder.add_text_field("with_stop_words", text_options);
schema_builder.add_date_field("date", INDEXED);
schema_builder.add_f64_field("float", INDEXED);
schema_builder.add_facet_field("facet", FacetOptions::default());
schema_builder.add_bytes_field("bytes", INDEXED);
schema_builder.add_bytes_field("bytes_not_indexed", STORED);
schema_builder.add_json_field("json", TEXT);
schema_builder.add_json_field("json_not_indexed", STORED);
schema_builder.add_bool_field("bool", INDEXED);
schema_builder.add_bool_field("notindexed_bool", STORED);
schema_builder.build()
}
fn make_query_parser_with_default_fields(default_fields: &[&'static str]) -> QueryParser {
let schema = make_schema();
let default_fields: Vec<Field> = default_fields
.iter()
.flat_map(|field_name| schema.get_field(field_name))
.collect();
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::from(SimpleTokenizer)
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()])),
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}
fn make_query_parser() -> QueryParser {
make_query_parser_with_default_fields(&["title", "text"])
}
fn parse_query_to_logical_ast(
query: &str,
default_conjunction: bool,
) -> Result<LogicalAst, QueryParserError> {
let mut query_parser = make_query_parser();
if default_conjunction {
query_parser.set_conjunction_by_default();
}
query_parser.parse_query_to_logical_ast(query)
}
fn test_parse_query_to_logical_ast_helper(
query: &str,
expected: &str,
default_conjunction: bool,
) {
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, expected);
}
#[test]
pub fn test_parse_query_facet() {
let query_parser = make_query_parser();
let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap();
assert_eq!(
format!("{:?}", query),
r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"#
);
}
#[test]
pub fn test_parse_query_with_boost() {
let mut query_parser = make_query_parser();
let schema = make_schema();
let text_field = schema.get_field("text").unwrap();
query_parser.set_field_boost(text_field, 2.0);
let query = query_parser.parse_query("text:hello").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"#
);
}
#[test]
pub fn test_parse_query_range_with_boost() {
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
assert_eq!(
format!("{:?}", query),
"RangeQuery { field: Field(0), value_type: Str, left_bound: Included([97]), \
right_bound: Included([98]) }"
);
}
#[test]
pub fn test_parse_query_with_default_boost_and_custom_boost() {
let mut query_parser = make_query_parser();
let schema = make_schema();
let text_field = schema.get_field("text").unwrap();
query_parser.set_field_boost(text_field, 2.0);
let query = query_parser.parse_query("text:hello^2").unwrap();
assert_eq!(
format!("{:?}", query),
r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"#
);
}
#[test]
pub fn test_parse_nonindexed_field_yields_error() {
let query_parser = make_query_parser();
let is_not_indexed_err = |query: &str| {
let result: Result<Box<dyn Query>, QueryParserError> = query_parser.parse_query(query);
if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
Some(field_name)
} else {
None
}
};
assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
assert_eq!(
is_not_indexed_err("notindexed_bool:true"),
Some(String::from("notindexed_bool"))
);
}
#[test]
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
r#"Term(type=Str, field=7, "wordone wordtwo")"#,
false,
);
}
#[test]
pub fn test_parse_query_empty() {
test_parse_query_to_logical_ast_helper("", "<emptyclause>", false);
test_parse_query_to_logical_ast_helper(" ", "<emptyclause>", false);
let query_parser = make_query_parser();
let query_result = query_parser.parse_query("");
let query = query_result.unwrap();
assert_eq!(format!("{:?}", query), "EmptyQuery");
}
#[test]
pub fn test_parse_query_ints() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
assert!(query_parser.parse_query("float:\"3.1\"").is_ok());
assert!(query_parser.parse_query("float:\"-2.4\"").is_ok());
assert!(query_parser.parse_query("float:\"2.1.2\"").is_err());
assert!(query_parser.parse_query("float:\"2.1a\"").is_err());
assert!(query_parser
.parse_query("float:\"18446744073709551615.0\"")
.is_ok());
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term(type=U64, field=3, 2324)",
false,
);
test_parse_query_to_logical_ast_helper(
"signed:-2324",
&format!(
"{:?}",
Term::from_field_i64(Field::from_field_id(2u32), -2324)
),
false,
);
test_parse_query_to_logical_ast_helper(
"float:2.5",
&format!(
"{:?}",
Term::from_field_f64(Field::from_field_id(10u32), 2.5)
),
false,
);
}
#[test]
fn test_parse_bytes() {
test_parse_query_to_logical_ast_helper(
"bytes:YnVidQ==",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
false,
);
}
#[test]
fn test_parse_bool() {
test_parse_query_to_logical_ast_helper(
"bool:true",
&format!(
"{:?}",
Term::from_field_bool(Field::from_field_id(16u32), true),
),
false,
);
}
#[test]
fn test_parse_bytes_not_indexed() {
let error = parse_query_to_logical_ast("bytes_not_indexed:aaa", false).unwrap_err();
assert!(matches!(error, QueryParserError::FieldNotIndexed(_)));
}
#[test]
fn test_json_field() {
test_parse_query_to_logical_ast_helper(
"json.titi:hello",
"Term(type=Json, field=14, path=titi, vtype=Str, \"hello\")",
false,
);
}
fn extract_query_term_json_path(query: &str) -> String {
let LogicalAst::Leaf(literal) = parse_query_to_logical_ast(query, false).unwrap() else {
panic!();
};
let LogicalLiteral::Term(term) = *literal else {
panic!();
};
std::str::from_utf8(term.value_bytes()).unwrap().to_string()
}
#[test]
fn test_json_field_query_with_escaped_dot() {
assert_eq!(
extract_query_term_json_path(r#"json.k8s.node.name:hello"#),
"k8s\u{1}node\u{1}name\0shello"
);
assert_eq!(
extract_query_term_json_path(r#"json.k8s\.node\.name:hello"#),
"k8s.node.name\0shello"
);
}
#[test]
fn test_json_field_possibly_a_number() {
test_parse_query_to_logical_ast_helper(
"json.titi:5",
r#"(Term(type=Json, field=14, path=titi, vtype=U64, 5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"json.titi:-5",
r#"(Term(type=Json, field=14, path=titi, vtype=I64, -5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-".
true,
);
test_parse_query_to_logical_ast_helper(
"json.titi:-5.2",
r#"(Term(type=Json, field=14, path=titi, vtype=F64, -5.2) "[(0, Term(type=Json, field=14, path=titi, vtype=Str, "5")), (1, Term(type=Json, field=14, path=titi, vtype=Str, "2"))]")"#,
true,
);
}
#[test]
fn test_json_field_possibly_a_date() {
test_parse_query_to_logical_ast_helper(
r#"json.date:"2019-10-12T07:20:50.52Z""#,
r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#,
true,
);
}
#[test]
fn test_json_field_possibly_a_bool() {
test_parse_query_to_logical_ast_helper(
"json.titi:true",
r#"(Term(type=Json, field=14, path=titi, vtype=Bool, true) Term(type=Json, field=14, path=titi, vtype=Str, "true"))"#,
true,
);
}
#[test]
fn test_json_field_not_indexed() {
let error = parse_query_to_logical_ast("json_not_indexed.titi:hello", false).unwrap_err();
assert!(matches!(error, QueryParserError::FieldNotIndexed(_)));
}
fn test_query_to_logical_ast_with_default_json(
query: &str,
expected: &str,
default_conjunction: bool,
) {
let mut query_parser = make_query_parser_with_default_fields(&["json"]);
if default_conjunction {
query_parser.set_conjunction_by_default();
}
let ast = query_parser.parse_query_to_logical_ast(query).unwrap();
let ast_str = format!("{ast:?}");
assert_eq!(ast_str, expected);
}
#[test]
fn test_json_default() {
test_query_to_logical_ast_with_default_json(
"titi:4",
"(Term(type=Json, field=14, path=titi, vtype=U64, 4) Term(type=Json, field=14, \
path=titi, vtype=Str, \"4\"))",
false,
);
}
#[test]
fn test_json_default_with_different_field() {
for conjunction in [false, true] {
test_query_to_logical_ast_with_default_json(
"text:4",
r#"Term(type=Str, field=1, "4")"#,
conjunction,
);
}
}
#[test]
fn test_json_default_with_same_field() {
for conjunction in [false, true] {
test_query_to_logical_ast_with_default_json(
"json:4",
r#"(Term(type=Json, field=14, path=, vtype=U64, 4) Term(type=Json, field=14, path=, vtype=Str, "4"))"#,
conjunction,
);
}
}
#[test]
fn test_parse_bytes_phrase() {
test_parse_query_to_logical_ast_helper(
"bytes:\"YnVidQ==\"",
"Term(type=Bytes, field=12, [98, 117, 98, 117])",
false,
);
}
#[test]
fn test_parse_bytes_invalid_base64() {
let base64_err: QueryParserError =
parse_query_to_logical_ast("bytes:aa", false).unwrap_err();
assert!(matches!(base64_err, QueryParserError::ExpectedBase64(_)));
}
#[test]
fn test_parse_query_to_ast_ab_c() {
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#,
true,
);
}
#[test]
pub fn test_parse_query_to_ast_single_term() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, "toto")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
false,
);
}
#[test]
fn test_single_negative_term() {
assert_matches!(
parse_query_to_logical_ast("-title:toto", false),
Err(QueryParserError::AllButQueryForbidden)
);
}
#[test]
pub fn test_parse_query_to_ast_two_terms() {
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
r#"title:"a b""#,
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
false,
);
}
#[test]
pub fn test_parse_query_all_query() {
let logical_ast = parse_query_to_logical_ast("*", false).unwrap();
assert_eq!(format!("{logical_ast:?}"), "*");
}
#[test]
pub fn test_parse_query_range_require_a_target_field() {
let query_parser_error = parse_query_to_logical_ast("[A TO B]", false).err().unwrap();
assert_eq!(
query_parser_error.to_string(),
"Unsupported query: Range query need to target a specific field."
);
}
#[test]
pub fn test_parse_query_to_ast_ranges() {
test_parse_query_to_logical_ast_helper(
"title:[a TO b]",
r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}",
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{* TO toto}",
r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO *}",
r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#,
false,
);
test_parse_query_to_logical_ast_helper(
"signed:{-5 TO 3}",
r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#,
false,
);
test_parse_query_to_logical_ast_helper(
"float:{-1.5 TO 1.5}",
r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#,
false,
);
}
#[test]
pub fn test_query_parser_field_does_not_exist() {
let query_parser = make_query_parser();
assert_eq!(
query_parser
.parse_query("boujou:\"18446744073709551615\"")
.unwrap_err(),
QueryParserError::FieldDoesNotExist("boujou".to_string())
);
}
#[test]
pub fn test_query_parser_field_not_indexed() {
let query_parser = make_query_parser();
assert_matches!(
query_parser.parse_query("notindexed_text:\"18446744073709551615\""),
Err(QueryParserError::FieldNotIndexed(_))
);
}
#[test]
pub fn test_unknown_tokenizer() {
let mut schema_builder = Schema::builder();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("nonexistingtokenizer")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
let title = schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let default_fields = vec![title];
let tokenizer_manager = TokenizerManager::default();
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
assert_matches!(
query_parser.parse_query("title:\"happy tax payer\""),
Err(QueryParserError::UnknownTokenizer { .. })
);
}
#[test]
pub fn test_query_parser_no_positions() {
let mut schema_builder = Schema::builder();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("customtokenizer")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
let title = schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("customtokenizer", SimpleTokenizer);
let query_parser = QueryParser::for_index(&index, vec![title]);
assert_eq!(
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),
QueryParserError::FieldDoesNotHavePositionsIndexed("title".to_string())
);
}
#[test]
pub fn test_query_parser_expected_int() {
let query_parser = make_query_parser();
assert_matches!(
query_parser.parse_query("unsigned:18a"),
Err(QueryParserError::ExpectedInt(_))
);
assert!(query_parser.parse_query("unsigned:\"18\"").is_ok());
assert_matches!(
query_parser.parse_query("signed:18b"),
Err(QueryParserError::ExpectedInt(_))
);
assert!(query_parser.parse_query("float:\"1.8\"").is_ok());
assert_matches!(
query_parser.parse_query("float:1.8a"),
Err(QueryParserError::ExpectedFloat(_))
);
}
#[test]
pub fn test_query_parser_expected_bool() {
let query_parser = make_query_parser();
assert_matches!(
query_parser.parse_query("bool:brie"),
Err(QueryParserError::ExpectedBool(_))
);
assert!(query_parser.parse_query("bool:\"true\"").is_ok());
assert!(query_parser.parse_query("bool:\"false\"").is_ok());
}
#[test]
pub fn test_query_parser_expected_date() {
let query_parser = make_query_parser();
assert_matches!(
query_parser.parse_query("date:18a"),
Err(QueryParserError::DateFormatError(_))
);
test_parse_query_to_logical_ast_helper(
r#"date:"2010-11-21T09:55:06.000000000+02:00""#,
r#"Term(type=Date, field=9, 2010-11-21T07:55:06Z)"#,
true,
);
test_parse_query_to_logical_ast_helper(
r#"date:"1985-04-12T23:20:50.52Z""#,
r#"Term(type=Date, field=9, 1985-04-12T23:20:50Z)"#,
true,
);
}
#[test]
pub fn test_query_parser_expected_facet() {
let query_parser = make_query_parser();
match query_parser.parse_query("facet:INVALID") {
Ok(_) => panic!("should never succeed"),
Err(e) => assert_eq!(
"The facet field is malformed: Failed to parse the facet string: 'INVALID'",
format!("{}", e)
),
}
assert!(query_parser.parse_query("facet:\"/foo/bar\"").is_ok());
}
#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query(" !, ").is_ok());
assert!(query_parser.parse_query("with_stop_words:the").is_ok());
}
#[test]
pub fn test_parse_query_single_negative_term_through_error() {
assert_matches!(
parse_query_to_logical_ast("-title:toto", true),
Err(QueryParserError::AllButQueryForbidden)
);
assert_matches!(
parse_query_to_logical_ast("-title:toto", false),
Err(QueryParserError::AllButQueryForbidden)
);
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(
"title:toto",
r#"Term(type=Str, field=0, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
r#"Term(type=Str, field=0, "toto")"#,
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#,
true,
);
}
#[test]
pub fn test_query_parser_hyphen() {
test_parse_query_to_logical_ast_helper(
"title:www-form-encoded",
r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#,
false,
);
}
#[test]
fn test_and_default_regardless_of_default_conjunctive() {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a AND title:b",
r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#,
default_conjunction,
);
}
}
#[test]
fn test_or_default_conjunctive() {
for &default_conjunction in &[false, true] {
test_parse_query_to_logical_ast_helper(
"title:a OR title:b",
r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#,
default_conjunction,
);
}
}
#[test]
fn test_escaped_field() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field(r#"a\.b"#, STRING);
let schema = schema_builder.build();
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
assert_eq!(
format!("{:?}", query),
"TermQuery(Term(type=Str, field=0, \"hello\"))"
);
}
#[test]
fn test_split_full_path() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("second", STRING);
schema_builder.add_text_field("first", STRING);
schema_builder.add_text_field("first.toto", STRING);
schema_builder.add_text_field("first.toto.titi", STRING);
schema_builder.add_text_field("third.a.b.c", STRING);
let schema = schema_builder.build();
let query_parser =
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
assert_eq!(
query_parser.split_full_path("first.toto"),
Some((schema.get_field("first.toto").unwrap(), ""))
);
assert_eq!(
query_parser.split_full_path("first.toto.bubu"),
Some((schema.get_field("first.toto").unwrap(), "bubu"))
);
assert_eq!(
query_parser.split_full_path("first.toto.titi"),
Some((schema.get_field("first.toto.titi").unwrap(), ""))
);
assert_eq!(
query_parser.split_full_path("first.titi"),
Some((schema.get_field("first").unwrap(), "titi"))
);
assert_eq!(query_parser.split_full_path("third"), None);
assert_eq!(query_parser.split_full_path("hello.toto"), None);
assert_eq!(query_parser.split_full_path(""), None);
assert_eq!(query_parser.split_full_path("firsty"), None);
}
#[test]
pub fn test_phrase_slop() {
test_parse_query_to_logical_ast_helper(
"\"a b\"~0",
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]" "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"\"a b\"~2",
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]"~2 "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]"~2)"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b~4\"~2",
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b")), (2, Term(type=Str, field=0, "4"))]"~2"#,
false,
);
}
#[test]
pub fn test_term_set_query() {
test_parse_query_to_logical_ast_helper(
"title: IN [a b cd]",
r#"IN [Term(type=Str, field=0, "a"), Term(type=Str, field=0, "b"), Term(type=Str, field=0, "cd")]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"bytes: IN [AA== ABA= ABCD]",
r#"IN [Term(type=Bytes, field=12, [0]), Term(type=Bytes, field=12, [0, 16]), Term(type=Bytes, field=12, [0, 16, 131])]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"signed: IN [1 2 -3]",
r#"IN [Term(type=I64, field=2, 1), Term(type=I64, field=2, 2), Term(type=I64, field=2, -3)]"#,
false,
);
test_parse_query_to_logical_ast_helper(
"float: IN [1.1 2.2 -3.3]",
r#"IN [Term(type=F64, field=10, 1.1), Term(type=F64, field=10, 2.2), Term(type=F64, field=10, -3.3)]"#,
false,
);
}
}