Merge pull request #2692 from quickwit-oss/paul.masurel/coerce-floats-too-in-search-too

This PR changes the logic used on the ingestion of floats.
This commit is contained in:
Paul Masurel
2025-09-10 09:46:54 +02:00
committed by GitHub
4 changed files with 195 additions and 43 deletions

View File

@@ -1,3 +1,5 @@
use std::str::FromStr;
use common::DateTime;
use crate::InvalidData;
@@ -9,6 +11,23 @@ pub enum NumericalValue {
F64(f64),
}
impl FromStr for NumericalValue {
type Err = ();
fn from_str(s: &str) -> Result<Self, ()> {
if let Ok(val_i64) = s.parse::<i64>() {
return Ok(val_i64.into());
}
if let Ok(val_u64) = s.parse::<u64>() {
return Ok(val_u64.into());
}
if let Ok(val_f64) = s.parse::<f64>() {
return Ok(NumericalValue::from(val_f64).normalize());
}
Err(())
}
}
impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType {
match self {
@@ -26,7 +45,7 @@ impl NumericalValue {
if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64)
} else {
NumericalValue::F64(val as f64)
NumericalValue::U64(val)
}
}
NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -141,6 +160,7 @@ impl Coerce for DateTime {
#[cfg(test)]
mod tests {
use super::NumericalType;
use crate::NumericalValue;
#[test]
fn test_numerical_type_code() {
@@ -153,4 +173,58 @@ mod tests {
}
assert_eq!(num_numerical_type, 3);
}
#[test]
fn test_parse_numerical() {
assert_eq!(
"123".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(123)
);
assert_eq!(
"18446744073709551615".parse::<NumericalValue>().unwrap(),
NumericalValue::U64(18446744073709551615u64)
);
assert_eq!(
"1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(1i64)
);
assert_eq!(
"1.1".parse::<NumericalValue>().unwrap(),
NumericalValue::F64(1.1f64)
);
assert_eq!(
"-1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(-1i64)
);
}
#[test]
fn test_normalize_numerical() {
assert_eq!(
NumericalValue::from(1u64).normalize(),
NumericalValue::I64(1i64),
);
let limit_val = i64::MAX as u64 + 1u64;
assert_eq!(
NumericalValue::from(limit_val).normalize(),
NumericalValue::U64(limit_val),
);
assert_eq!(
NumericalValue::from(-1i64).normalize(),
NumericalValue::I64(-1i64),
);
assert_eq!(
NumericalValue::from(-2.0f64).normalize(),
NumericalValue::I64(-2i64),
);
assert_eq!(
NumericalValue::from(-2.1f64).normalize(),
NumericalValue::F64(-2.1f64),
);
let large_float = 2.0f64.powf(70.0f64);
assert_eq!(
NumericalValue::from(large_float).normalize(),
NumericalValue::F64(large_float),
);
}
}

View File

@@ -1,3 +1,4 @@
use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value(val);
term_buffer.append_type_and_fast_value::<u64>(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
@@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::F64(val) => {
if !val.is_finite() {
return;
};
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
// Normalize here is important.
// In the inverted index, we coerce all numerical values to their canonical
// representation.
//
// (We do the same thing on the query side)
match NumericalValue::F64(val).normalize() {
NumericalValue::I64(val_i64) => {
term_buffer.append_type_and_fast_value::<i64>(val_i64);
}
NumericalValue::U64(val_u64) => {
term_buffer.append_type_and_fast_value::<u64>(val_u64);
}
NumericalValue::F64(val_f64) => {
term_buffer.append_type_and_fast_value::<f64>(val_f64);
}
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Bool(val) => {
@@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(
mut term: Term,
phrase: &str,
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
assert_eq!(
@@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term(
0,
"JSON value bytes should be empty"
);
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
.or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
.or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
}
fn try_convert_to_datetime_and_append_to_json_term(
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
}
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(dt);
Some(term_clone)
}
fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
let mut term_clone = term.clone();
// Parse is actually returning normalized values already today, but let's not
// not rely on that hidden contract.
match numerical_value.normalize() {
NumericalValue::I64(i64_value) => {
term_clone.append_type_and_fast_value::<i64>(i64_value);
}
NumericalValue::U64(u64_value) => {
term_clone.append_type_and_fast_value::<u64>(u64_value);
}
NumericalValue::F64(f64_value) => {
term_clone.append_type_and_fast_value::<f64>(f64_value);
}
term.append_type_and_fast_value(dt);
return Some(term);
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
term.append_type_and_fast_value(i64_val);
return Some(term);
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
term.append_type_and_fast_value(u64_val);
return Some(term);
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
term.append_type_and_fast_value(f64_val);
return Some(term);
}
if let Ok(bool_val) = str::parse::<bool>(phrase) {
term.append_type_and_fast_value(bool_val);
return Some(term);
}
None
Some(term_clone)
}
fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
let val = str::parse::<bool>(text).ok()?;
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(val);
Some(term_clone)
}
/// Splits a json path supplied to the query parser in such a way that

View File

@@ -370,6 +370,8 @@ macro_rules! fail_point {
/// Common test utilities.
#[cfg(test)]
pub mod tests {
use std::collections::BTreeMap;
use common::{BinarySerializable, FixedSize};
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
use rand::distributions::{Bernoulli, Uniform};
@@ -382,7 +384,7 @@ pub mod tests {
use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy;
use crate::postings::Postings;
use crate::query::BooleanQuery;
use crate::query::{BooleanQuery, QueryParser};
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
@@ -1223,4 +1225,49 @@ pub mod tests {
);
assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
}
#[test]
fn test_json_number_ambiguity() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::I64(1i64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::U64(1u64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.num_docs(), 3);
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1.0").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
}
}

View File

@@ -495,24 +495,17 @@ impl QueryParser {
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(ref json_options) => {
let get_term_with_path = || {
Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
)
};
let mut term = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
if let Some(term) =
// Try to convert the phrase to a fast value
convert_to_fast_value_and_append_to_json_term(
get_term_with_path(),
phrase,
false,
)
convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
{
Ok(term)
} else {
let mut term = get_term_with_path();
term.append_type_and_str(phrase);
Ok(term)
}
@@ -1028,7 +1021,7 @@ fn generate_literals_for_json_object(
// Try to convert the phrase to a fast value
if let Some(term) =
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
{
logical_literals.push(LogicalLiteral::Term(term));
}