issues/65 Phrase queries for untokenized fields are not tokenized.

This commit is contained in:
Paul Masurel
2017-05-06 15:23:01 +09:00
parent 05111599b3
commit 2947364ae1
9 changed files with 179 additions and 44 deletions

View File

@@ -2,8 +2,11 @@ Tantivy 0.4.0
==========================
- Removed u32 fields. They are replaced by u64 and i64 fields (#65)
-
- QueryParser:
- Explicit error returned when searched for a term that is not indexed
- Searching for an int term via the query parser was broken `(age:1)`
- Searching for a non-indexed field returns an explicit Error
- Phrase queries for non-tokenized fields are not tokenized by the query parser.
Tantivy 0.3.1
==========================

View File

@@ -16,7 +16,7 @@ impl FastFieldNotAvailableError {
/// for which fast fields are not available.
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
FastFieldNotAvailableError {
field_name: field_entry.name().clone(),
field_name: field_entry.name().to_string(),
}
}
}

View File

@@ -12,6 +12,7 @@ pub struct FastFieldsWriter {
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let field_writers: Vec<IntFastFieldWriter> = schema
.fields()
@@ -47,6 +48,9 @@ impl FastFieldsWriter {
}
}
/// Returns a `FastFieldsWriter`
/// with a `IntFastFieldWriter` for each
/// of the fields given as argument.
pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
FastFieldsWriter {
field_writers: fields

View File

@@ -374,14 +374,14 @@ impl SegmentUpdater {
self.run_async(move |segment_updater| {
debug!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
let mut file_protection_opt = None;
let mut _file_protection_opt = None;
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.0.index.opstamp();
if delete_operation.opstamp < committed_opstamp {
let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp) {
Ok(file_protection_opt_res) => {
file_protection_opt = file_protection_opt_res;
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", before_merge_segment_ids, e);

View File

@@ -10,8 +10,21 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
phrase.or(word)
};
let field = many1(letter());
let term_query = (field, char(':'), term_val()).map(|(field_name, _, phrase)| {
let negative_numbers =
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field =
(
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))
)
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,

View File

@@ -10,7 +10,9 @@ use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
use schema::Term;
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
@@ -24,15 +26,23 @@ pub enum QueryParserError {
FieldDoesNotExist(String),
/// The query contains a term for a `u64`-field, but the value
/// is not a u64.
ExpectedU64(String, String),
ExpectedInt(ParseIntError),
/// Queries that are purely "excluding" are forbidden. (e.g. -title:pop)
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbidden.
NoDefaultFieldDeclared,
/// The field searched for is not declared
/// as indexed in the schema.
FieldNotIndexed(String),
}
impl From<ParseIntError> for QueryParserError {
fn from(err: ParseIntError) -> QueryParserError {
QueryParserError::ExpectedInt(err)
}
}
/// Tantivy's Query parser
///
@@ -121,7 +131,7 @@ impl QueryParser {
fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
}
@@ -132,25 +142,51 @@ impl QueryParser {
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
let mut token_iter = self.analyzer.tokenize(phrase);
let mut tokens: Vec<Term> = Vec::new();
loop {
if let Some(token) = token_iter.next() {
let text = token.to_string();
// TODO Handle u64
let term = Term::from_field_text(field, &text);
tokens.push(term);
} else {
break;
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
if !field_type.is_indexed() {
let field_name = field_entry.name().to_string();
return Err(QueryParserError::FieldNotIndexed(field_name));
}
match field_type {
&FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
let term = Term::from_field_i64(field, val);
return Ok(Some(LogicalLiteral::Term(term)));
}
&FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
let term = Term::from_field_u64(field, val);
return Ok(Some(LogicalLiteral::Term(term)));
}
&FieldType::Str(ref str_options) => {
let mut terms: Vec<Term> = Vec::new();
if str_options.get_indexing_options().is_tokenized() {
let mut token_iter = self.analyzer.tokenize(phrase);
loop {
if let Some(token) = token_iter.next() {
let term = Term::from_field_text(field, token);
terms.push(term);
} else {
break;
}
}
}
else {
terms.push(Term::from_field_text(field, phrase));
}
if terms.is_empty() {
return Ok(None);
}
else if terms.len() == 1 {
return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
} else {
return Ok(Some(LogicalLiteral::Phrase(terms)))
}
}
}
if tokens.is_empty() {
Ok(None)
} else if tokens.len() == 1 {
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
} else {
Ok(Some(LogicalLiteral::Phrase(tokens)))
}
}
fn default_occur(&self) -> Occur {
@@ -208,23 +244,23 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(box ast));
}
}
let result_ast = if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
let result_ast =
if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok((Occur::Should, result_ast))
}
}
}
}
/// Compose two occur values.
fn compose_occur(left: Occur, right: Occur) -> Occur {
match left {
@@ -269,16 +305,23 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
#[cfg(test)]
mod test {
use schema::{SchemaBuilder, TEXT};
use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
use query::Query;
use schema::Field;
use super::QueryParser;
use super::QueryParserError;
use super::super::logical_ast::*;
fn make_query_parser() -> QueryParser {
let mut schema_builder = SchemaBuilder::default();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
schema_builder.add_i64_field("signed", INT_INDEXED);
schema_builder.add_u64_field("unsigned", INT_INDEXED);
schema_builder.add_text_field("notindexed_text", STORED);
schema_builder.add_text_field("notindexed_u64", STORED);
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
let schema = schema_builder.build();
let default_fields = vec![title, text];
QueryParser::new(schema, default_fields)
@@ -309,6 +352,64 @@ mod test {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("toto").is_ok());
}
#[test]
pub fn test_parse_nonindexed_field_yields_error() {
let query_parser = make_query_parser();
let is_not_indexed_err = |query: &str| {
let result: Result<Box<Query>, QueryParserError> = query_parser.parse_query(query);
if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
Some(field_name.clone())
}
else {
None
}
};
assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
}
#[test]
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, 101, 32, 119, 111, 114, 100, 116, 119, 111])",
false);
}
#[test]
pub fn test_parse_query_ints() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser.parse_query("signed:\"-9999999999999\"").is_ok());
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser.parse_query("signed:\"18446744073709551615\"").is_err());
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser.parse_query("unsigned:\"18446744073709551615\"").is_ok());
test_parse_query_to_logical_ast_helper("unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false);
test_parse_query_to_logical_ast_helper("signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
false);
}
#[test]

View File

@@ -51,7 +51,7 @@ impl FieldEntry {
/// Returns the name of the field
pub fn name(&self,) -> &String {
pub fn name(&self,) -> &str {
&self.name
}

View File

@@ -31,6 +31,20 @@ pub enum FieldType {
impl FieldType {
pub fn is_indexed(&self) -> bool {
match self {
&FieldType::Str(ref text_options) => {
text_options.get_indexing_options().is_indexed()
}
&FieldType::U64(ref int_options) => {
int_options.is_indexed()
}
&FieldType::I64(ref int_options) => {
int_options.is_indexed()
}
}
}
/// Parses a field value from json, given the target FieldType.
///
/// Tantivy will not try to cast values.

View File

@@ -107,7 +107,7 @@ impl SchemaBuilder {
/// Adds a field entry to the schema in build.
fn add_field(&mut self, field_entry: FieldEntry) -> Field {
let field = Field(self.fields.len() as u32);
let field_name = field_entry.name().clone();
let field_name = field_entry.name().to_string();
self.fields.push(field_entry);
self.fields_map.insert(field_name, field);
field
@@ -173,7 +173,7 @@ impl Schema {
}
/// Return the field name for a given `Field`.
pub fn get_field_name(&self, field: Field) -> &String {
pub fn get_field_name(&self, field: Field) -> &str {
self.get_field_entry(field).name()
}
@@ -205,7 +205,7 @@ impl Schema {
.map(|field_val| field_val.value() )
.cloned()
.collect();
field_map.insert(field_name.clone(), values);
field_map.insert(field_name.to_string(), values);
}
NamedFieldDocument(field_map)
}