From 5d6c8de23e08d1451d789cf8534bd51bad0c5e3f Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 9 Sep 2025 17:38:38 +0200 Subject: [PATCH] Align search float search logic to the columnar coercion rules It applies the same logic on floats as for u64 or i64. In all case, the idea is (for the inverted index) to coerce number to their canonical representation, before indexing and before searching. That way a document with the float 1.0 will be searchable when the user searches for 1. Note that contrary to the columnar, we do not attempt to coerce all of the terms associated to a given json path to a single numerical type. We simply rely on this "point-wise" canonicalization. --- columnar/src/value.rs | 76 ++++++++++++++++++++- src/core/json_utils.rs | 92 ++++++++++++++++++-------- src/lib.rs | 49 +++++++++++++- src/query/query_parser/query_parser.rs | 21 ++---- 4 files changed, 195 insertions(+), 43 deletions(-) diff --git a/columnar/src/value.rs b/columnar/src/value.rs index 81b5367ab..1a17ca05e 100644 --- a/columnar/src/value.rs +++ b/columnar/src/value.rs @@ -1,3 +1,5 @@ +use std::str::FromStr; + use common::DateTime; use crate::InvalidData; @@ -9,6 +11,23 @@ pub enum NumericalValue { F64(f64), } +impl FromStr for NumericalValue { + type Err = (); + + fn from_str(s: &str) -> Result { + if let Ok(val_i64) = s.parse::() { + return Ok(val_i64.into()); + } + if let Ok(val_u64) = s.parse::() { + return Ok(val_u64.into()); + } + if let Ok(val_f64) = s.parse::() { + return Ok(NumericalValue::from(val_f64).normalize()); + } + Err(()) + } +} + impl NumericalValue { pub fn numerical_type(&self) -> NumericalType { match self { @@ -26,7 +45,7 @@ impl NumericalValue { if val <= i64::MAX as u64 { NumericalValue::I64(val as i64) } else { - NumericalValue::F64(val as f64) + NumericalValue::U64(val) } } NumericalValue::I64(val) => NumericalValue::I64(val), @@ -141,6 +160,7 @@ impl Coerce for DateTime { #[cfg(test)] mod tests { use super::NumericalType; + use crate::NumericalValue; #[test] fn test_numerical_type_code() { @@ -153,4 +173,58 @@ mod tests { } assert_eq!(num_numerical_type, 3); } + + #[test] + fn test_parse_numerical() { + assert_eq!( + "123".parse::().unwrap(), + NumericalValue::I64(123) + ); + assert_eq!( + "18446744073709551615".parse::().unwrap(), + NumericalValue::U64(18446744073709551615u64) + ); + assert_eq!( + "1.0".parse::().unwrap(), + NumericalValue::I64(1i64) + ); + assert_eq!( + "1.1".parse::().unwrap(), + NumericalValue::F64(1.1f64) + ); + assert_eq!( + "-1.0".parse::().unwrap(), + NumericalValue::I64(-1i64) + ); + } + + #[test] + fn test_normalize_numerical() { + assert_eq!( + NumericalValue::from(1u64).normalize(), + NumericalValue::I64(1i64), + ); + let limit_val = i64::MAX as u64 + 1u64; + assert_eq!( + NumericalValue::from(limit_val).normalize(), + NumericalValue::U64(limit_val), + ); + assert_eq!( + NumericalValue::from(-1i64).normalize(), + NumericalValue::I64(-1i64), + ); + assert_eq!( + NumericalValue::from(-2.0f64).normalize(), + NumericalValue::I64(-2i64), + ); + assert_eq!( + NumericalValue::from(-2.1f64).normalize(), + NumericalValue::F64(-2.1f64), + ); + let large_float = 2.0f64.powf(70.0f64); + assert_eq!( + NumericalValue::from(large_float).normalize(), + NumericalValue::F64(large_float), + ); + } } diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 774bf4440..d40a3fcf6 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -1,3 +1,4 @@ +use columnar::NumericalValue; use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP}; use common::{replace_in_place, JsonPathWriter}; use rustc_hash::FxHashMap; @@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( if let Ok(i64_val) = val.try_into() { term_buffer.append_type_and_fast_value::(i64_val); } else { - term_buffer.append_type_and_fast_value(val); + term_buffer.append_type_and_fast_value::(val); } postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } @@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } ReferenceValueLeaf::F64(val) => { + if !val.is_finite() { + return; + }; set_path_id( term_buffer, ctx.path_to_unordered_id .get_or_allocate_unordered_id(json_path_writer.as_str()), ); - term_buffer.append_type_and_fast_value(val); + // Normalize here is important. + // In the inverted index, we coerce all numerical values to their canonical + // representation. + // + // (We do the same thing on the query side) + match NumericalValue::F64(val).normalize() { + NumericalValue::I64(val_i64) => { + term_buffer.append_type_and_fast_value::(val_i64); + } + NumericalValue::U64(val_u64) => { + term_buffer.append_type_and_fast_value::(val_u64); + } + NumericalValue::F64(val_f64) => { + term_buffer.append_type_and_fast_value::(val_f64); + } + } postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } ReferenceValueLeaf::Bool(val) => { @@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( /// /// The term must be json + JSON path. pub fn convert_to_fast_value_and_append_to_json_term( - mut term: Term, - phrase: &str, + term: &Term, + text: &str, truncate_date_for_search: bool, ) -> Option { assert_eq!( @@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term( 0, "JSON value bytes should be empty" ); - if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { - let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC)); - if truncate_date_for_search { - dt = dt.truncate(DATE_TIME_PRECISION_INDEXED); + try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search) + .or_else(|| try_convert_to_number_and_append_to_json_term(term, text)) + .or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text)) +} + +fn try_convert_to_datetime_and_append_to_json_term( + term: &Term, + text: &str, + truncate_date_for_search: bool, +) -> Option { + let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?; + let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC)); + if truncate_date_for_search { + dt = dt.truncate(DATE_TIME_PRECISION_INDEXED); + } + let mut term_clone = term.clone(); + term_clone.append_type_and_fast_value(dt); + Some(term_clone) +} + +fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option { + let numerical_value: NumericalValue = str::parse::(text).ok()?; + let mut term_clone = term.clone(); + // Parse is actually returning normalized values already today, but let's not + // not rely on that hidden contract. + match numerical_value.normalize() { + NumericalValue::I64(i64_value) => { + term_clone.append_type_and_fast_value::(i64_value); + } + NumericalValue::U64(u64_value) => { + term_clone.append_type_and_fast_value::(u64_value); + } + NumericalValue::F64(f64_value) => { + term_clone.append_type_and_fast_value::(f64_value); } - term.append_type_and_fast_value(dt); - return Some(term); } - if let Ok(i64_val) = str::parse::(phrase) { - term.append_type_and_fast_value(i64_val); - return Some(term); - } - if let Ok(u64_val) = str::parse::(phrase) { - term.append_type_and_fast_value(u64_val); - return Some(term); - } - if let Ok(f64_val) = str::parse::(phrase) { - term.append_type_and_fast_value(f64_val); - return Some(term); - } - if let Ok(bool_val) = str::parse::(phrase) { - term.append_type_and_fast_value(bool_val); - return Some(term); - } - None + Some(term_clone) +} + +fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option { + let val = str::parse::(text).ok()?; + let mut term_clone = term.clone(); + term_clone.append_type_and_fast_value(val); + Some(term_clone) } /// Splits a json path supplied to the query parser in such a way that diff --git a/src/lib.rs b/src/lib.rs index 41ceda695..789a8e953 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -370,6 +370,8 @@ macro_rules! fail_point { /// Common test utilities. #[cfg(test)] pub mod tests { + use std::collections::BTreeMap; + use common::{BinarySerializable, FixedSize}; use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral}; use rand::distributions::{Bernoulli, Uniform}; @@ -382,7 +384,7 @@ pub mod tests { use crate::index::SegmentReader; use crate::merge_policy::NoMergePolicy; use crate::postings::Postings; - use crate::query::BooleanQuery; + use crate::query::{BooleanQuery, QueryParser}; use crate::schema::*; use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy}; @@ -1223,4 +1225,49 @@ pub mod tests { ); assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro()); } + + #[test] + fn test_json_number_ambiguity() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("number", crate::schema::TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests().unwrap(); + { + let mut doc = TantivyDocument::new(); + let mut obj = BTreeMap::default(); + obj.insert("key".to_string(), OwnedValue::I64(1i64)); + doc.add_object(json_field, obj); + index_writer.add_document(doc).unwrap(); + } + { + let mut doc = TantivyDocument::new(); + let mut obj = BTreeMap::default(); + obj.insert("key".to_string(), OwnedValue::U64(1u64)); + doc.add_object(json_field, obj); + index_writer.add_document(doc).unwrap(); + } + { + let mut doc = TantivyDocument::new(); + let mut obj = BTreeMap::default(); + obj.insert("key".to_string(), OwnedValue::F64(1.0f64)); + doc.add_object(json_field, obj); + index_writer.add_document(doc).unwrap(); + } + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + assert_eq!(searcher.num_docs(), 3); + { + let parser = QueryParser::for_index(&index, vec![]); + let query = parser.parse_query("number.key:1").unwrap(); + let count = searcher.search(&query, &crate::collector::Count).unwrap(); + assert_eq!(count, 3); + } + { + let parser = QueryParser::for_index(&index, vec![]); + let query = parser.parse_query("number.key:1.0").unwrap(); + let count = searcher.search(&query, &crate::collector::Count).unwrap(); + assert_eq!(count, 3); + } + } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 62d15d8f5..c44de3886 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -495,24 +495,17 @@ impl QueryParser { Ok(terms.into_iter().next().unwrap()) } FieldType::JsonObject(ref json_options) => { - let get_term_with_path = || { - Term::from_field_json_path( - field, - json_path, - json_options.is_expand_dots_enabled(), - ) - }; + let mut term = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); if let Some(term) = // Try to convert the phrase to a fast value - convert_to_fast_value_and_append_to_json_term( - get_term_with_path(), - phrase, - false, - ) + convert_to_fast_value_and_append_to_json_term(&term, phrase, false) { Ok(term) } else { - let mut term = get_term_with_path(); term.append_type_and_str(phrase); Ok(term) } @@ -1028,7 +1021,7 @@ fn generate_literals_for_json_object( // Try to convert the phrase to a fast value if let Some(term) = - convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true) + convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true) { logical_literals.push(LogicalLiteral::Term(term)); }