mirror of https://github.com/quickwit-oss/tantivy.git
issues/65: Phrase queries for untokenized fields are not tokenized.
@@ -2,8 +2,11 @@ Tantivy 0.4.0
 ==========================

 - Removed u32 fields. They are replaced by u64 and i64 fields (#65)
 - QueryParser:
+  - Explicit error returned when searched for a term that is not indexed
   - Searching for a int term via the query parser was broken `(age:1)`
+  - Searching for a non-indexed field returns an explicit Error
+  - Phrase query for non-tokenized field are not tokenized by the query parser.

 Tantivy 0.3.1
 ==========================
@@ -16,7 +16,7 @@ impl FastFieldNotAvailableError {
     /// for which fast fields are not available.
     pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
         FastFieldNotAvailableError {
-            field_name: field_entry.name().clone(),
+            field_name: field_entry.name().to_string(),
         }
     }
 }
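This hunk (and the matching ones in SchemaBuilder and Schema further down) swaps `.clone()` for `.to_string()` because `FieldEntry::name()` now returns `&str` instead of `&String` (see the `impl FieldEntry` hunk below). A minimal sketch of the pattern, outside tantivy (names are illustrative, not the library's API):

    struct FieldEntry { name: String }

    impl FieldEntry {
        // Returning &str borrows the String's buffer; callers that need
        // an owned copy now call .to_string() instead of .clone().
        fn name(&self) -> &str {
            &self.name // &String auto-derefs to &str
        }
    }

    fn main() {
        let entry = FieldEntry { name: String::from("title") };
        let owned: String = entry.name().to_string();
        println!("{} {}", entry.name(), owned);
    }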
@@ -12,6 +12,7 @@ pub struct FastFieldsWriter {

 impl FastFieldsWriter {
+
     /// Create all `FastFieldWriter` required by the schema.
     pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
         let field_writers: Vec<IntFastFieldWriter> = schema
             .fields()
@@ -47,6 +48,9 @@ impl FastFieldsWriter {
         }
     }

+    /// Returns a `FastFieldsWriter`
+    /// with a `IntFastFieldWriter` for each
+    /// of the field given in argument.
     pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
         FastFieldsWriter {
             field_writers: fields
@@ -374,14 +374,14 @@ impl SegmentUpdater {
         self.run_async(move |segment_updater| {
             debug!("End merge {:?}", after_merge_segment_entry.meta());
             let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
-            let mut file_protection_opt = None;
+            let mut _file_protection_opt = None;
             if let Some(delete_operation) = delete_cursor.get() {
                 let committed_opstamp = segment_updater.0.index.opstamp();
                 if delete_operation.opstamp < committed_opstamp {
                     let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
                     match advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp) {
                         Ok(file_protection_opt_res) => {
-                            file_protection_opt = file_protection_opt_res;
+                            _file_protection_opt = file_protection_opt_res;
                         }
                         Err(e) => {
                             error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", before_merge_segment_ids, e);
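The only substantive change in this hunk is the leading underscore on `_file_protection_opt`: the binding is written but never read, and the underscore keeps the value (and whatever file it protects) alive until end of scope while telling rustc the non-use is intentional. A standalone sketch of the idiom (the guard type here is hypothetical, not tantivy's):

    struct FileProtection; // stand-in for an RAII guard

    impl Drop for FileProtection {
        fn drop(&mut self) {
            println!("protection released");
        }
    }

    fn main() {
        // `_guard` is never read, but unlike a bare `_` pattern it still
        // owns the value, so Drop runs at end of scope, not immediately.
        let _guard = FileProtection;
        println!("still protected");
    }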
@@ -10,8 +10,21 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
         let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
         phrase.or(word)
     };
-    let field = many1(letter());
-    let term_query = (field, char(':'), term_val()).map(|(field_name, _, phrase)| {
+
+    let negative_numbers =
+        (char('-'), many1(satisfy(|c: char| c.is_numeric())))
+        .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
+
+    let field =
+        (
+            letter(),
+            many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))
+        )
+        .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
+
+    let term_val_with_field = negative_numbers.or(term_val());
+
+    let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
         UserInputLiteral {
             field_name: Some(field_name),
             phrase: phrase,
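With the extended grammar, a field name may contain digits and underscores after a leading letter, and a bare negative integer is accepted as a term value. A hedged restatement of what now parses, echoing the tests added later in this commit (`make_query_parser` is the test helper defined there):

    let query_parser = make_query_parser();
    // underscores in field names now lex as part of the field
    assert!(query_parser.parse_query("notindexed_text:titi").is_err()); // parses, then hits FieldNotIndexed
    // negative numbers are accepted without quoting
    assert!(query_parser.parse_query("signed:-2324").is_ok());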
@@ -10,7 +10,9 @@ use postings::SegmentPostingsOption;
 use query::PhraseQuery;
 use analyzer::SimpleTokenizer;
 use analyzer::StreamingIterator;
-use schema::Term;
+use schema::{Term, FieldType};
+use std::str::FromStr;
+use std::num::ParseIntError;

@@ -24,15 +26,23 @@ pub enum QueryParserError {
     FieldDoesNotExist(String),
     /// The query contains a term for a `u64`-field, but the value
     /// is not a u64.
-    ExpectedU64(String, String),
+    ExpectedInt(ParseIntError),
     /// It is forbidden queries that are only "excluding". (e.g. -title:pop)
     AllButQueryForbidden,
     /// If no default field is declared, running a query without any
     /// field specified is forbbidden.
     NoDefaultFieldDeclared,
+    /// The field searched for is not declared
+    /// as indexed in the schema.
+    FieldNotIndexed(String),
 }

+
+impl From<ParseIntError> for QueryParserError {
+    fn from(err: ParseIntError) -> QueryParserError {
+        QueryParserError::ExpectedInt(err)
+    }
+}

 /// Tantivy's Query parser
 ///
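The new `From<ParseIntError>` impl is what lets the parser use `?` on `i64::from_str` / `u64::from_str` in the hunks below: on failure, the operator converts the `ParseIntError` into `QueryParserError::ExpectedInt` automatically. A self-contained sketch of the mechanism (not tantivy code, just the conversion pattern):

    use std::num::ParseIntError;
    use std::str::FromStr;

    #[derive(Debug)]
    enum QueryParserError {
        ExpectedInt(ParseIntError),
    }

    impl From<ParseIntError> for QueryParserError {
        fn from(err: ParseIntError) -> QueryParserError {
            QueryParserError::ExpectedInt(err)
        }
    }

    fn parse_int_term(phrase: &str) -> Result<i64, QueryParserError> {
        // `?` calls QueryParserError::from on the error branch
        let val = i64::from_str(phrase)?;
        Ok(val)
    }

    fn main() {
        assert!(parse_int_term("2324").is_ok());
        assert!(parse_int_term("2a").is_err()); // ExpectedInt(ParseIntError)
    }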
@@ -121,7 +131,7 @@ impl QueryParser {
     fn compute_logical_ast(&self,
                            user_input_ast: UserInputAST)
                            -> Result<LogicalAST, QueryParserError> {
-        let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
+        let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
         if occur == Occur::MustNot {
             return Err(QueryParserError::AllButQueryForbidden);
         }
@@ -132,25 +142,51 @@ impl QueryParser {
                                field: Field,
                                phrase: &str)
                                -> Result<Option<LogicalLiteral>, QueryParserError> {
-        let mut token_iter = self.analyzer.tokenize(phrase);
-        let mut tokens: Vec<Term> = Vec::new();
-        loop {
-            if let Some(token) = token_iter.next() {
-                let text = token.to_string();
-                // TODO Handle u64
-                let term = Term::from_field_text(field, &text);
-                tokens.push(term);
-            } else {
-                break;
+        let field_entry = self.schema.get_field_entry(field);
+        let field_type = field_entry.field_type();
+        if !field_type.is_indexed() {
+            let field_name = field_entry.name().to_string();
+            return Err(QueryParserError::FieldNotIndexed(field_name));
+        }
+        match field_type {
+            &FieldType::I64(_) => {
+                let val: i64 = i64::from_str(phrase)?;
+                let term = Term::from_field_i64(field, val);
+                return Ok(Some(LogicalLiteral::Term(term)));
+            }
+            &FieldType::U64(_) => {
+                let val: u64 = u64::from_str(phrase)?;
+                let term = Term::from_field_u64(field, val);
+                return Ok(Some(LogicalLiteral::Term(term)));
+            }
+            &FieldType::Str(ref str_options) => {
+                let mut terms: Vec<Term> = Vec::new();
+                if str_options.get_indexing_options().is_tokenized() {
+                    let mut token_iter = self.analyzer.tokenize(phrase);
+                    loop {
+                        if let Some(token) = token_iter.next() {
+                            let term = Term::from_field_text(field, token);
+                            terms.push(term);
+                        } else {
+                            break;
+                        }
+                    }
+                }
+                else {
+                    terms.push(Term::from_field_text(field, phrase));
+                }
+                if terms.is_empty() {
+                    return Ok(None);
+                }
+                else if terms.len() == 1 {
+                    return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
+                } else {
+                    return Ok(Some(LogicalLiteral::Phrase(terms)))
+                }
             }
         }
-        if tokens.is_empty() {
-            Ok(None)
-        } else if tokens.len() == 1 {
-            Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
-        } else {
-            Ok(Some(LogicalLiteral::Phrase(tokens)))
-        }
     }

     fn default_occur(&self) -> Occur {
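In effect, `compute_logical_literal` now dispatches on the schema's `FieldType` instead of unconditionally tokenizing: integer fields parse the text with `from_str`, tokenized text fields still go through the analyzer, and untokenized (`STRING`) fields keep the whole phrase as a single term — which is the fix for issue 65. Restated against the tests added below (these assertions echo the test suite, using its `make_query_parser` helper):

    let parser = make_query_parser();
    // STRING field: the quoted phrase stays one Term, not a tokenized phrase query
    assert!(parser.parse_query("nottokenized:\"wordone wordtwo\"").is_ok());
    // i64 field: non-numeric text is rejected with ExpectedInt(ParseIntError)
    assert!(parser.parse_query("signed:\"2a\"").is_err());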
@@ -208,23 +244,23 @@ impl QueryParser {
                         asts.push(LogicalAST::Leaf(box ast));
                     }
                 }
-                let result_ast = if asts.len() == 0 {
-                    // this should never happen
-                    return Err(QueryParserError::SyntaxError);
-                } else if asts.len() == 1 {
-                    asts[0].clone()
-                } else {
-                    LogicalAST::Clause(asts.into_iter()
-                        .map(|ast| (Occur::Should, ast))
-                        .collect())
-                };
+                let result_ast =
+                    if asts.len() == 0 {
+                        // this should never happen
+                        return Err(QueryParserError::SyntaxError);
+                    } else if asts.len() == 1 {
+                        asts[0].clone()
+                    } else {
+                        LogicalAST::Clause(asts.into_iter()
+                            .map(|ast| (Occur::Should, ast))
+                            .collect())
+                    };
                 Ok((Occur::Should, result_ast))
             }
         }
     }
 }


 /// Compose two occur values.
 fn compose_occur(left: Occur, right: Occur) -> Occur {
     match left {
@@ -269,16 +305,23 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {

 #[cfg(test)]
 mod test {
-    use schema::{SchemaBuilder, TEXT};
+    use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
     use query::Query;
+    use schema::Field;
     use super::QueryParser;
     use super::QueryParserError;
     use super::super::logical_ast::*;

     fn make_query_parser() -> QueryParser {
         let mut schema_builder = SchemaBuilder::default();
         let title = schema_builder.add_text_field("title", TEXT);
         let text = schema_builder.add_text_field("text", TEXT);
+        schema_builder.add_i64_field("signed", INT_INDEXED);
+        schema_builder.add_u64_field("unsigned", INT_INDEXED);
+        schema_builder.add_text_field("notindexed_text", STORED);
+        schema_builder.add_text_field("notindexed_u64", STORED);
+        schema_builder.add_text_field("notindexed_i64", STORED);
+        schema_builder.add_text_field("nottokenized", STRING);
         let schema = schema_builder.build();
         let default_fields = vec![title, text];
         QueryParser::new(schema, default_fields)
@@ -309,6 +352,64 @@ mod test {
         let query_parser = make_query_parser();
         assert!(query_parser.parse_query("toto").is_ok());
     }

+    #[test]
+    pub fn test_parse_nonindexed_field_yields_error() {
+        let query_parser = make_query_parser();
+
+        let is_not_indexed_err = |query: &str| {
+            let result: Result<Box<Query>, QueryParserError> = query_parser.parse_query(query);
+            if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
+                Some(field_name.clone())
+            }
+            else {
+                None
+            }
+        };
+
+        assert_eq!(
+            is_not_indexed_err("notindexed_text:titi"),
+            Some(String::from("notindexed_text"))
+        );
+        assert_eq!(
+            is_not_indexed_err("notindexed_u64:23424"),
+            Some(String::from("notindexed_u64"))
+        );
+        assert_eq!(
+            is_not_indexed_err("notindexed_i64:-234324"),
+            Some(String::from("notindexed_i64"))
+        );
+    }
+
+
+    #[test]
+    pub fn test_parse_query_untokenized() {
+        test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
+                                               "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, 101, 32, 119, 111, 114, 100, 116, 119, 111])",
+                                               false);
+    }
+
+    #[test]
+    pub fn test_parse_query_ints() {
+        let query_parser = make_query_parser();
+        assert!(query_parser.parse_query("signed:2324").is_ok());
+        assert!(query_parser.parse_query("signed:\"22\"").is_ok());
+        assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
+        assert!(query_parser.parse_query("signed:\"-9999999999999\"").is_ok());
+        assert!(query_parser.parse_query("signed:\"a\"").is_err());
+        assert!(query_parser.parse_query("signed:\"2a\"").is_err());
+        assert!(query_parser.parse_query("signed:\"18446744073709551615\"").is_err());
+        assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
+        assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
+        assert!(query_parser.parse_query("unsigned:\"18446744073709551615\"").is_ok());
+        test_parse_query_to_logical_ast_helper("unsigned:2324",
+                                               "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
+                                               false);
+
+        test_parse_query_to_logical_ast_helper("signed:-2324",
+                                               &format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
+                                               false);
+    }
+

     #[test]
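The expected strings in these tests are `Term`'s Debug output: reading the fixtures, the first four bytes are the field id as a big-endian u32 and the rest is the value. Field 7 is "nottokenized" (the eighth field added in `make_query_parser`), followed by the UTF-8 bytes of "wordone wordtwo"; for `unsigned:2324`, field 3 is followed by 2324 as a big-endian u64. A quick decoding sketch (my reading of the layout, inferred from these fixtures rather than stated by the diff):

    use std::convert::TryInto;

    fn main() {
        let term: [u8; 12] = [0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20];
        let field = u32::from_be_bytes([term[0], term[1], term[2], term[3]]);
        let value = u64::from_be_bytes(term[4..12].try_into().unwrap());
        assert_eq!((field, value), (3, 2324)); // 9 * 256 + 20 == 2324
        println!("field={} value={}", field, value);
    }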
@@ -51,7 +51,7 @@ impl FieldEntry {


     /// Returns the name of the field
-    pub fn name(&self,) -> &String {
+    pub fn name(&self,) -> &str {
         &self.name
     }

@@ -31,6 +31,20 @@ pub enum FieldType {

 impl FieldType {

+    pub fn is_indexed(&self) -> bool {
+        match self {
+            &FieldType::Str(ref text_options) => {
+                text_options.get_indexing_options().is_indexed()
+            }
+            &FieldType::U64(ref int_options) => {
+                int_options.is_indexed()
+            }
+            &FieldType::I64(ref int_options) => {
+                int_options.is_indexed()
+            }
+        }
+    }
+
     /// Parses a field value from json, given the target FieldType.
     ///
     /// Tantivy will not try to cast values.
@@ -107,7 +107,7 @@ impl SchemaBuilder {
     /// Adds a field entry to the schema in build.
     fn add_field(&mut self, field_entry: FieldEntry) -> Field {
         let field = Field(self.fields.len() as u32);
-        let field_name = field_entry.name().clone();
+        let field_name = field_entry.name().to_string();
         self.fields.push(field_entry);
         self.fields_map.insert(field_name, field);
         field
@@ -173,7 +173,7 @@ impl Schema {
     }

     /// Return the field name for a given `Field`.
-    pub fn get_field_name(&self, field: Field) -> &String {
+    pub fn get_field_name(&self, field: Field) -> &str {
         self.get_field_entry(field).name()
     }

@@ -205,7 +205,7 @@ impl Schema {
             .map(|field_val| field_val.value() )
             .cloned()
             .collect();
-        field_map.insert(field_name.clone(), values);
+        field_map.insert(field_name.to_string(), values);
     }
     NamedFieldDocument(field_map)
 }