issues/65 Phrase queries for untokenized fields are not tokenized.

This commit is contained in:
Paul Masurel
2017-05-06 15:23:01 +09:00
parent 05111599b3
commit 2947364ae1
9 changed files with 179 additions and 44 deletions

View File

@@ -2,8 +2,11 @@ Tantivy 0.4.0
==========================
- Removed u32 fields. They are replaced by u64 and i64 fields (#65)
-
- QueryParser:
- Explicit error returned when searched for a term that is not indexed
- Searching for an int term via the query parser was broken `(age:1)`
- Searching for a non-indexed field returns an explicit Error
- Phrase queries for non-tokenized fields are not tokenized by the query parser.
Tantivy 0.3.1
==========================

View File

@@ -16,7 +16,7 @@ impl FastFieldNotAvailableError {
/// for which fast fields are not available.
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
FastFieldNotAvailableError {
field_name: field_entry.name().clone(),
field_name: field_entry.name().to_string(),
}
}
}

View File

@@ -12,6 +12,7 @@ pub struct FastFieldsWriter {
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let field_writers: Vec<IntFastFieldWriter> = schema
.fields()
@@ -47,6 +48,9 @@ impl FastFieldsWriter {
}
}
/// Returns a `FastFieldsWriter`
/// with a `IntFastFieldWriter` for each
/// of the fields given as argument.
pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
FastFieldsWriter {
field_writers: fields

View File

@@ -374,14 +374,14 @@ impl SegmentUpdater {
self.run_async(move |segment_updater| {
debug!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
let mut file_protection_opt = None;
let mut _file_protection_opt = None;
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.0.index.opstamp();
if delete_operation.opstamp < committed_opstamp {
let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp) {
Ok(file_protection_opt_res) => {
file_protection_opt = file_protection_opt_res;
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", before_merge_segment_ids, e);

View File

@@ -10,8 +10,21 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
phrase.or(word)
};
let field = many1(letter());
let term_query = (field, char(':'), term_val()).map(|(field_name, _, phrase)| {
let negative_numbers =
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field =
(
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))
)
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,

View File

@@ -10,7 +10,9 @@ use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
use schema::Term;
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
@@ -24,15 +26,23 @@ pub enum QueryParserError {
FieldDoesNotExist(String),
/// The query contains a term for a `u64`-field, but the value
/// is not a u64.
ExpectedU64(String, String),
ExpectedInt(ParseIntError),
/// Queries that are purely "excluding" are forbidden. (e.g. -title:pop)
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbidden.
NoDefaultFieldDeclared,
/// The field searched for is not declared
/// as indexed in the schema.
FieldNotIndexed(String),
}
impl From<ParseIntError> for QueryParserError {
fn from(err: ParseIntError) -> QueryParserError {
QueryParserError::ExpectedInt(err)
}
}
/// Tantivy's Query parser
///
@@ -121,7 +131,7 @@ impl QueryParser {
fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
}
@@ -132,25 +142,51 @@ impl QueryParser {
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
let mut token_iter = self.analyzer.tokenize(phrase);
let mut tokens: Vec<Term> = Vec::new();
loop {
if let Some(token) = token_iter.next() {
let text = token.to_string();
// TODO Handle u64
let term = Term::from_field_text(field, &text);
tokens.push(term);
} else {
break;
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
if !field_type.is_indexed() {
let field_name = field_entry.name().to_string();
return Err(QueryParserError::FieldNotIndexed(field_name));
}
match field_type {
&FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
let term = Term::from_field_i64(field, val);
return Ok(Some(LogicalLiteral::Term(term)));
}
&FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
let term = Term::from_field_u64(field, val);
return Ok(Some(LogicalLiteral::Term(term)));
}
&FieldType::Str(ref str_options) => {
let mut terms: Vec<Term> = Vec::new();
if str_options.get_indexing_options().is_tokenized() {
let mut token_iter = self.analyzer.tokenize(phrase);
loop {
if let Some(token) = token_iter.next() {
let term = Term::from_field_text(field, token);
terms.push(term);
} else {
break;
}
}
}
else {
terms.push(Term::from_field_text(field, phrase));
}
if terms.is_empty() {
return Ok(None);
}
else if terms.len() == 1 {
return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
} else {
return Ok(Some(LogicalLiteral::Phrase(terms)))
}
}
}
if tokens.is_empty() {
Ok(None)
} else if tokens.len() == 1 {
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
} else {
Ok(Some(LogicalLiteral::Phrase(tokens)))
}
}
fn default_occur(&self) -> Occur {
@@ -208,23 +244,23 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(box ast));
}
}
let result_ast = if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
let result_ast =
if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok((Occur::Should, result_ast))
}
}
}
}
/// Compose two occur values.
fn compose_occur(left: Occur, right: Occur) -> Occur {
match left {
@@ -269,16 +305,23 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
#[cfg(test)]
mod test {
use schema::{SchemaBuilder, TEXT};
use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
use query::Query;
use schema::Field;
use super::QueryParser;
use super::QueryParserError;
use super::super::logical_ast::*;
fn make_query_parser() -> QueryParser {
let mut schema_builder = SchemaBuilder::default();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
schema_builder.add_i64_field("signed", INT_INDEXED);
schema_builder.add_u64_field("unsigned", INT_INDEXED);
schema_builder.add_text_field("notindexed_text", STORED);
schema_builder.add_text_field("notindexed_u64", STORED);
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
let schema = schema_builder.build();
let default_fields = vec![title, text];
QueryParser::new(schema, default_fields)
@@ -309,6 +352,64 @@ mod test {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("toto").is_ok());
}
#[test]
pub fn test_parse_nonindexed_field_yields_error() {
let query_parser = make_query_parser();
let is_not_indexed_err = |query: &str| {
let result: Result<Box<Query>, QueryParserError> = query_parser.parse_query(query);
if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
Some(field_name.clone())
}
else {
None
}
};
assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
}
#[test]
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, 101, 32, 119, 111, 114, 100, 116, 119, 111])",
false);
}
#[test]
pub fn test_parse_query_ints() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser.parse_query("signed:\"-9999999999999\"").is_ok());
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser.parse_query("signed:\"18446744073709551615\"").is_err());
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser.parse_query("unsigned:\"18446744073709551615\"").is_ok());
test_parse_query_to_logical_ast_helper("unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false);
test_parse_query_to_logical_ast_helper("signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
false);
}
#[test]

View File

@@ -51,7 +51,7 @@ impl FieldEntry {
/// Returns the name of the field
pub fn name(&self,) -> &String {
pub fn name(&self,) -> &str {
&self.name
}

View File

@@ -31,6 +31,20 @@ pub enum FieldType {
impl FieldType {
pub fn is_indexed(&self) -> bool {
match self {
&FieldType::Str(ref text_options) => {
text_options.get_indexing_options().is_indexed()
}
&FieldType::U64(ref int_options) => {
int_options.is_indexed()
}
&FieldType::I64(ref int_options) => {
int_options.is_indexed()
}
}
}
/// Parses a field value from json, given the target FieldType.
///
/// Tantivy will not try to cast values.

View File

@@ -107,7 +107,7 @@ impl SchemaBuilder {
/// Adds a field entry to the schema in build.
fn add_field(&mut self, field_entry: FieldEntry) -> Field {
let field = Field(self.fields.len() as u32);
let field_name = field_entry.name().clone();
let field_name = field_entry.name().to_string();
self.fields.push(field_entry);
self.fields_map.insert(field_name, field);
field
@@ -173,7 +173,7 @@ impl Schema {
}
/// Return the field name for a given `Field`.
pub fn get_field_name(&self, field: Field) -> &String {
pub fn get_field_name(&self, field: Field) -> &str {
self.get_field_entry(field).name()
}
@@ -205,7 +205,7 @@ impl Schema {
.map(|field_val| field_val.value() )
.cloned()
.collect();
field_map.insert(field_name.clone(), values);
field_map.insert(field_name.to_string(), values);
}
NamedFieldDocument(field_map)
}