diff --git a/columnar/src/value.rs b/columnar/src/value.rs index 81b5367ab..1a17ca05e 100644 --- a/columnar/src/value.rs +++ b/columnar/src/value.rs @@ -1,3 +1,5 @@ +use std::str::FromStr; + use common::DateTime; use crate::InvalidData; @@ -9,6 +11,23 @@ pub enum NumericalValue { F64(f64), } +impl FromStr for NumericalValue { + type Err = (); + + fn from_str(s: &str) -> Result { + if let Ok(val_i64) = s.parse::() { + return Ok(val_i64.into()); + } + if let Ok(val_u64) = s.parse::() { + return Ok(val_u64.into()); + } + if let Ok(val_f64) = s.parse::() { + return Ok(NumericalValue::from(val_f64).normalize()); + } + Err(()) + } +} + impl NumericalValue { pub fn numerical_type(&self) -> NumericalType { match self { @@ -26,7 +45,7 @@ impl NumericalValue { if val <= i64::MAX as u64 { NumericalValue::I64(val as i64) } else { - NumericalValue::F64(val as f64) + NumericalValue::U64(val) } } NumericalValue::I64(val) => NumericalValue::I64(val), @@ -141,6 +160,7 @@ impl Coerce for DateTime { #[cfg(test)] mod tests { use super::NumericalType; + use crate::NumericalValue; #[test] fn test_numerical_type_code() { @@ -153,4 +173,58 @@ mod tests { } assert_eq!(num_numerical_type, 3); } + + #[test] + fn test_parse_numerical() { + assert_eq!( + "123".parse::().unwrap(), + NumericalValue::I64(123) + ); + assert_eq!( + "18446744073709551615".parse::().unwrap(), + NumericalValue::U64(18446744073709551615u64) + ); + assert_eq!( + "1.0".parse::().unwrap(), + NumericalValue::I64(1i64) + ); + assert_eq!( + "1.1".parse::().unwrap(), + NumericalValue::F64(1.1f64) + ); + assert_eq!( + "-1.0".parse::().unwrap(), + NumericalValue::I64(-1i64) + ); + } + + #[test] + fn test_normalize_numerical() { + assert_eq!( + NumericalValue::from(1u64).normalize(), + NumericalValue::I64(1i64), + ); + let limit_val = i64::MAX as u64 + 1u64; + assert_eq!( + NumericalValue::from(limit_val).normalize(), + NumericalValue::U64(limit_val), + ); + assert_eq!( + NumericalValue::from(-1i64).normalize(), + NumericalValue::I64(-1i64), + ); + assert_eq!( + NumericalValue::from(-2.0f64).normalize(), + NumericalValue::I64(-2i64), + ); + assert_eq!( + NumericalValue::from(-2.1f64).normalize(), + NumericalValue::F64(-2.1f64), + ); + let large_float = 2.0f64.powf(70.0f64); + assert_eq!( + NumericalValue::from(large_float).normalize(), + NumericalValue::F64(large_float), + ); + } } diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 774bf4440..d40a3fcf6 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -1,3 +1,4 @@ +use columnar::NumericalValue; use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP}; use common::{replace_in_place, JsonPathWriter}; use rustc_hash::FxHashMap; @@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( if let Ok(i64_val) = val.try_into() { term_buffer.append_type_and_fast_value::(i64_val); } else { - term_buffer.append_type_and_fast_value(val); + term_buffer.append_type_and_fast_value::(val); } postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } @@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } ReferenceValueLeaf::F64(val) => { + if !val.is_finite() { + return; + }; set_path_id( term_buffer, ctx.path_to_unordered_id .get_or_allocate_unordered_id(json_path_writer.as_str()), ); - term_buffer.append_type_and_fast_value(val); + // Normalize here is important. + // In the inverted index, we coerce all numerical values to their canonical + // representation. + // + // (We do the same thing on the query side) + match NumericalValue::F64(val).normalize() { + NumericalValue::I64(val_i64) => { + term_buffer.append_type_and_fast_value::(val_i64); + } + NumericalValue::U64(val_u64) => { + term_buffer.append_type_and_fast_value::(val_u64); + } + NumericalValue::F64(val_f64) => { + term_buffer.append_type_and_fast_value::(val_f64); + } + } postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } ReferenceValueLeaf::Bool(val) => { @@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( /// /// The term must be json + JSON path. pub fn convert_to_fast_value_and_append_to_json_term( - mut term: Term, - phrase: &str, + term: &Term, + text: &str, truncate_date_for_search: bool, ) -> Option { assert_eq!( @@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term( 0, "JSON value bytes should be empty" ); - if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { - let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC)); - if truncate_date_for_search { - dt = dt.truncate(DATE_TIME_PRECISION_INDEXED); + try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search) + .or_else(|| try_convert_to_number_and_append_to_json_term(term, text)) + .or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text)) +} + +fn try_convert_to_datetime_and_append_to_json_term( + term: &Term, + text: &str, + truncate_date_for_search: bool, +) -> Option { + let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?; + let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC)); + if truncate_date_for_search { + dt = dt.truncate(DATE_TIME_PRECISION_INDEXED); + } + let mut term_clone = term.clone(); + term_clone.append_type_and_fast_value(dt); + Some(term_clone) +} + +fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option { + let numerical_value: NumericalValue = str::parse::(text).ok()?; + let mut term_clone = term.clone(); + // Parse is actually returning normalized values already today, but let's not + // not rely on that hidden contract. + match numerical_value.normalize() { + NumericalValue::I64(i64_value) => { + term_clone.append_type_and_fast_value::(i64_value); + } + NumericalValue::U64(u64_value) => { + term_clone.append_type_and_fast_value::(u64_value); + } + NumericalValue::F64(f64_value) => { + term_clone.append_type_and_fast_value::(f64_value); } - term.append_type_and_fast_value(dt); - return Some(term); } - if let Ok(i64_val) = str::parse::(phrase) { - term.append_type_and_fast_value(i64_val); - return Some(term); - } - if let Ok(u64_val) = str::parse::(phrase) { - term.append_type_and_fast_value(u64_val); - return Some(term); - } - if let Ok(f64_val) = str::parse::(phrase) { - term.append_type_and_fast_value(f64_val); - return Some(term); - } - if let Ok(bool_val) = str::parse::(phrase) { - term.append_type_and_fast_value(bool_val); - return Some(term); - } - None + Some(term_clone) +} + +fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option { + let val = str::parse::(text).ok()?; + let mut term_clone = term.clone(); + term_clone.append_type_and_fast_value(val); + Some(term_clone) } /// Splits a json path supplied to the query parser in such a way that diff --git a/src/lib.rs b/src/lib.rs index 41ceda695..789a8e953 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -370,6 +370,8 @@ macro_rules! fail_point { /// Common test utilities. #[cfg(test)] pub mod tests { + use std::collections::BTreeMap; + use common::{BinarySerializable, FixedSize}; use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral}; use rand::distributions::{Bernoulli, Uniform}; @@ -382,7 +384,7 @@ pub mod tests { use crate::index::SegmentReader; use crate::merge_policy::NoMergePolicy; use crate::postings::Postings; - use crate::query::BooleanQuery; + use crate::query::{BooleanQuery, QueryParser}; use crate::schema::*; use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy}; @@ -1223,4 +1225,49 @@ pub mod tests { ); assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro()); } + + #[test] + fn test_json_number_ambiguity() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("number", crate::schema::TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests().unwrap(); + { + let mut doc = TantivyDocument::new(); + let mut obj = BTreeMap::default(); + obj.insert("key".to_string(), OwnedValue::I64(1i64)); + doc.add_object(json_field, obj); + index_writer.add_document(doc).unwrap(); + } + { + let mut doc = TantivyDocument::new(); + let mut obj = BTreeMap::default(); + obj.insert("key".to_string(), OwnedValue::U64(1u64)); + doc.add_object(json_field, obj); + index_writer.add_document(doc).unwrap(); + } + { + let mut doc = TantivyDocument::new(); + let mut obj = BTreeMap::default(); + obj.insert("key".to_string(), OwnedValue::F64(1.0f64)); + doc.add_object(json_field, obj); + index_writer.add_document(doc).unwrap(); + } + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + assert_eq!(searcher.num_docs(), 3); + { + let parser = QueryParser::for_index(&index, vec![]); + let query = parser.parse_query("number.key:1").unwrap(); + let count = searcher.search(&query, &crate::collector::Count).unwrap(); + assert_eq!(count, 3); + } + { + let parser = QueryParser::for_index(&index, vec![]); + let query = parser.parse_query("number.key:1.0").unwrap(); + let count = searcher.search(&query, &crate::collector::Count).unwrap(); + assert_eq!(count, 3); + } + } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 62d15d8f5..c44de3886 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -495,24 +495,17 @@ impl QueryParser { Ok(terms.into_iter().next().unwrap()) } FieldType::JsonObject(ref json_options) => { - let get_term_with_path = || { - Term::from_field_json_path( - field, - json_path, - json_options.is_expand_dots_enabled(), - ) - }; + let mut term = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); if let Some(term) = // Try to convert the phrase to a fast value - convert_to_fast_value_and_append_to_json_term( - get_term_with_path(), - phrase, - false, - ) + convert_to_fast_value_and_append_to_json_term(&term, phrase, false) { Ok(term) } else { - let mut term = get_term_with_path(); term.append_type_and_str(phrase); Ok(term) } @@ -1028,7 +1021,7 @@ fn generate_literals_for_json_object( // Try to convert the phrase to a fast value if let Some(term) = - convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true) + convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true) { logical_literals.push(LogicalLiteral::Term(term)); }