mirror of https://github.com/quickwit-oss/tantivy.git
Align float search logic with the columnar coercion rules
This applies the same logic to floats as is already used for u64 and i64. In all cases, the idea is (for the inverted index) to coerce numbers to their canonical representation, both before indexing and before searching. That way, a document containing the float 1.0 is searchable when the user searches for 1. Note that, unlike the columnar storage, we do not attempt to coerce all of the terms associated with a given JSON path to a single numerical type; we simply rely on this point-wise canonicalization.
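The coercion rule itself is compact. Below is a minimal self-contained Rust sketch of the idea (an illustration, not the tantivy code: the enum mirrors the `NumericalValue` in the diff below, but the f64 range check is simplified and is sloppy near i64::MAX):

#[derive(Debug, PartialEq)]
enum NumericalValue {
    I64(i64),
    U64(u64),
    F64(f64),
}

impl NumericalValue {
    /// Coerce to the canonical representation: prefer I64, keep U64 only when
    /// the value exceeds i64::MAX, and keep F64 only for non-integral values.
    fn normalize(self) -> NumericalValue {
        match self {
            NumericalValue::U64(val) if val <= i64::MAX as u64 => {
                NumericalValue::I64(val as i64)
            }
            // Simplified: the boundary near i64::MAX is not handled carefully here.
            NumericalValue::F64(val)
                if val.fract() == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 =>
            {
                NumericalValue::I64(val as i64)
            }
            other => other,
        }
    }
}

fn main() {
    // 1, 1u64 and 1.0 all canonicalize to I64(1), hence to the same index term.
    assert_eq!(NumericalValue::F64(1.0).normalize(), NumericalValue::I64(1));
    assert_eq!(NumericalValue::U64(1).normalize(), NumericalValue::I64(1));
    // Non-integral floats keep their F64 representation.
    assert_eq!(NumericalValue::F64(1.5).normalize(), NumericalValue::F64(1.5));
}

Because the same canonicalization runs on the write path and on the query path, the term bytes produced for 1.0 and 1 are identical, which is all the inverted index needs.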
@@ -1,3 +1,5 @@
|
|||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
use common::DateTime;
|
use common::DateTime;
|
||||||
|
|
||||||
use crate::InvalidData;
|
use crate::InvalidData;
|
||||||
@@ -9,6 +11,23 @@ pub enum NumericalValue {
|
|||||||
F64(f64),
|
F64(f64),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl FromStr for NumericalValue {
|
||||||
|
type Err = ();
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Self, ()> {
|
||||||
|
if let Ok(val_i64) = s.parse::<i64>() {
|
||||||
|
return Ok(val_i64.into());
|
||||||
|
}
|
||||||
|
if let Ok(val_u64) = s.parse::<u64>() {
|
||||||
|
return Ok(val_u64.into());
|
||||||
|
}
|
||||||
|
if let Ok(val_f64) = s.parse::<f64>() {
|
||||||
|
return Ok(NumericalValue::from(val_f64).normalize());
|
||||||
|
}
|
||||||
|
Err(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl NumericalValue {
|
impl NumericalValue {
|
||||||
pub fn numerical_type(&self) -> NumericalType {
|
pub fn numerical_type(&self) -> NumericalType {
|
||||||
match self {
|
match self {
|
||||||
@@ -26,7 +45,7 @@ impl NumericalValue {
|
|||||||
if val <= i64::MAX as u64 {
|
if val <= i64::MAX as u64 {
|
||||||
NumericalValue::I64(val as i64)
|
NumericalValue::I64(val as i64)
|
||||||
} else {
|
} else {
|
||||||
NumericalValue::F64(val as f64)
|
NumericalValue::U64(val)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NumericalValue::I64(val) => NumericalValue::I64(val),
|
NumericalValue::I64(val) => NumericalValue::I64(val),
|
||||||
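A note on the `NumericalValue::U64(val)` arm above: the replaced code coerced u64 values above i64::MAX to f64, which is lossy, since an f64 mantissa holds only 53 bits. A standalone demonstration (illustration only, not part of the commit):

fn main() {
    // Near 2^64, consecutive f64 values are 2048 apart, so distinct u64
    // values collapse onto the same float.
    assert_eq!((u64::MAX - 1) as f64, u64::MAX as f64);

    // Integers up to 2^53 round-trip through f64 exactly...
    let exact: u64 = 1 << 53;
    assert_eq!((exact as f64) as u64, exact);

    // ...but the very next integer already does not.
    assert_ne!(((exact + 1) as f64) as u64, exact + 1);
}

Keeping U64 as its own canonical representation avoids making such documents indistinguishable.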
@@ -141,6 +160,7 @@ impl Coerce for DateTime {
 #[cfg(test)]
 mod tests {
     use super::NumericalType;
+    use crate::NumericalValue;
 
     #[test]
     fn test_numerical_type_code() {
@@ -153,4 +173,58 @@ mod tests {
         }
         assert_eq!(num_numerical_type, 3);
     }
+
+    #[test]
+    fn test_parse_numerical() {
+        assert_eq!(
+            "123".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(123)
+        );
+        assert_eq!(
+            "18446744073709551615".parse::<NumericalValue>().unwrap(),
+            NumericalValue::U64(18446744073709551615u64)
+        );
+        assert_eq!(
+            "1.0".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(1i64)
+        );
+        assert_eq!(
+            "1.1".parse::<NumericalValue>().unwrap(),
+            NumericalValue::F64(1.1f64)
+        );
+        assert_eq!(
+            "-1.0".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(-1i64)
+        );
+    }
+
+    #[test]
+    fn test_normalize_numerical() {
+        assert_eq!(
+            NumericalValue::from(1u64).normalize(),
+            NumericalValue::I64(1i64),
+        );
+        let limit_val = i64::MAX as u64 + 1u64;
+        assert_eq!(
+            NumericalValue::from(limit_val).normalize(),
+            NumericalValue::U64(limit_val),
+        );
+        assert_eq!(
+            NumericalValue::from(-1i64).normalize(),
+            NumericalValue::I64(-1i64),
+        );
+        assert_eq!(
+            NumericalValue::from(-2.0f64).normalize(),
+            NumericalValue::I64(-2i64),
+        );
+        assert_eq!(
+            NumericalValue::from(-2.1f64).normalize(),
+            NumericalValue::F64(-2.1f64),
+        );
+        let large_float = 2.0f64.powf(70.0f64);
+        assert_eq!(
+            NumericalValue::from(large_float).normalize(),
+            NumericalValue::F64(large_float),
+        );
+    }
 }
@@ -1,3 +1,4 @@
+use columnar::NumericalValue;
 use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
 use common::{replace_in_place, JsonPathWriter};
 use rustc_hash::FxHashMap;
@@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
             if let Ok(i64_val) = val.try_into() {
                 term_buffer.append_type_and_fast_value::<i64>(i64_val);
             } else {
-                term_buffer.append_type_and_fast_value(val);
+                term_buffer.append_type_and_fast_value::<u64>(val);
             }
             postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
         }
@@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
             postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
         }
         ReferenceValueLeaf::F64(val) => {
+            if !val.is_finite() {
+                return;
+            };
             set_path_id(
                 term_buffer,
                 ctx.path_to_unordered_id
                     .get_or_allocate_unordered_id(json_path_writer.as_str()),
             );
-            term_buffer.append_type_and_fast_value(val);
+            // Normalizing here is important.
+            // In the inverted index, we coerce all numerical values to their canonical
+            // representation.
+            //
+            // (We do the same thing on the query side.)
+            match NumericalValue::F64(val).normalize() {
+                NumericalValue::I64(val_i64) => {
+                    term_buffer.append_type_and_fast_value::<i64>(val_i64);
+                }
+                NumericalValue::U64(val_u64) => {
+                    term_buffer.append_type_and_fast_value::<u64>(val_u64);
+                }
+                NumericalValue::F64(val_f64) => {
+                    term_buffer.append_type_and_fast_value::<f64>(val_f64);
+                }
+            }
             postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
         }
         ReferenceValueLeaf::Bool(val) => {
@@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
 ///
 /// The term must be json + JSON path.
 pub fn convert_to_fast_value_and_append_to_json_term(
-    mut term: Term,
-    phrase: &str,
+    term: &Term,
+    text: &str,
     truncate_date_for_search: bool,
 ) -> Option<Term> {
     assert_eq!(
@@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term(
         0,
         "JSON value bytes should be empty"
     );
-    if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
-        let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
-        if truncate_date_for_search {
-            dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
-        }
-        term.append_type_and_fast_value(dt);
-        return Some(term);
-    }
-    if let Ok(i64_val) = str::parse::<i64>(phrase) {
-        term.append_type_and_fast_value(i64_val);
-        return Some(term);
-    }
-    if let Ok(u64_val) = str::parse::<u64>(phrase) {
-        term.append_type_and_fast_value(u64_val);
-        return Some(term);
-    }
-    if let Ok(f64_val) = str::parse::<f64>(phrase) {
-        term.append_type_and_fast_value(f64_val);
-        return Some(term);
-    }
-    if let Ok(bool_val) = str::parse::<bool>(phrase) {
-        term.append_type_and_fast_value(bool_val);
-        return Some(term);
-    }
-    None
+    try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
+        .or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
+        .or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
+}
+
+fn try_convert_to_datetime_and_append_to_json_term(
+    term: &Term,
+    text: &str,
+    truncate_date_for_search: bool,
+) -> Option<Term> {
+    let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
+    let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
+    if truncate_date_for_search {
+        dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
+    }
+    let mut term_clone = term.clone();
+    term_clone.append_type_and_fast_value(dt);
+    Some(term_clone)
+}
+
+fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
+    let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
+    let mut term_clone = term.clone();
+    // Parsing already returns normalized values today, but let's not rely on
+    // that hidden contract.
+    match numerical_value.normalize() {
+        NumericalValue::I64(i64_value) => {
+            term_clone.append_type_and_fast_value::<i64>(i64_value);
+        }
+        NumericalValue::U64(u64_value) => {
+            term_clone.append_type_and_fast_value::<u64>(u64_value);
+        }
+        NumericalValue::F64(f64_value) => {
+            term_clone.append_type_and_fast_value::<f64>(f64_value);
+        }
+    }
+    Some(term_clone)
+}
+
+fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
+    let val = str::parse::<bool>(text).ok()?;
+    let mut term_clone = term.clone();
+    term_clone.append_type_and_fast_value(val);
+    Some(term_clone)
 }
 
 /// Splits a json path supplied to the query parser in such a way that
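The refactor above replaces a chain of early returns with Option combinators: each helper owns one typed interpretation and clones the base term only on success. The shape of the pattern, reduced to a standalone sketch (the `try_parse` helper and its string output are hypothetical, purely for illustration):

fn first_typed_interpretation(text: &str) -> Option<String> {
    try_parse::<i64>(text)
        .or_else(|| try_parse::<f64>(text))
        .or_else(|| try_parse::<bool>(text))
}

// Hypothetical helper: succeeds iff `text` parses as T, rendering a tagged value.
fn try_parse<T: std::str::FromStr + std::fmt::Debug>(text: &str) -> Option<String> {
    text.parse::<T>()
        .ok()
        .map(|v| format!("{}:{:?}", std::any::type_name::<T>(), v))
}

fn main() {
    assert_eq!(first_typed_interpretation("42").as_deref(), Some("i64:42"));
    assert_eq!(first_typed_interpretation("true").as_deref(), Some("bool:true"));
    assert_eq!(first_typed_interpretation("abc"), None);
}

Taking `&Term` and cloning on success also lets the caller reuse the same base term for the fallback string path, as the query-parser hunk below shows.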
src/lib.rs (49 changed lines)
@@ -370,6 +370,8 @@ macro_rules! fail_point {
 /// Common test utilities.
 #[cfg(test)]
 pub mod tests {
+    use std::collections::BTreeMap;
+
     use common::{BinarySerializable, FixedSize};
     use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
     use rand::distributions::{Bernoulli, Uniform};
@@ -382,7 +384,7 @@ pub mod tests {
     use crate::index::SegmentReader;
     use crate::merge_policy::NoMergePolicy;
     use crate::postings::Postings;
-    use crate::query::BooleanQuery;
+    use crate::query::{BooleanQuery, QueryParser};
     use crate::schema::*;
     use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
 
@@ -1223,4 +1225,49 @@ pub mod tests {
         );
         assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
     }
+
+    #[test]
+    fn test_json_number_ambiguity() {
+        let mut schema_builder = Schema::builder();
+        let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::I64(1i64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::U64(1u64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        index_writer.commit().unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        assert_eq!(searcher.num_docs(), 3);
+        {
+            let parser = QueryParser::for_index(&index, vec![]);
+            let query = parser.parse_query("number.key:1").unwrap();
+            let count = searcher.search(&query, &crate::collector::Count).unwrap();
+            assert_eq!(count, 3);
+        }
+        {
+            let parser = QueryParser::for_index(&index, vec![]);
+            let query = parser.parse_query("number.key:1.0").unwrap();
+            let count = searcher.search(&query, &crate::collector::Count).unwrap();
+            assert_eq!(count, 3);
+        }
+    }
 }
@@ -495,24 +495,17 @@ impl QueryParser {
             Ok(terms.into_iter().next().unwrap())
         }
         FieldType::JsonObject(ref json_options) => {
-            let get_term_with_path = || {
-                Term::from_field_json_path(
-                    field,
-                    json_path,
-                    json_options.is_expand_dots_enabled(),
-                )
-            };
+            let mut term = Term::from_field_json_path(
+                field,
+                json_path,
+                json_options.is_expand_dots_enabled(),
+            );
             if let Some(term) =
                 // Try to convert the phrase to a fast value
-                convert_to_fast_value_and_append_to_json_term(
-                    get_term_with_path(),
-                    phrase,
-                    false,
-                )
+                convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
             {
                 Ok(term)
             } else {
-                let mut term = get_term_with_path();
                 term.append_type_and_str(phrase);
                 Ok(term)
             }
@@ -1028,7 +1021,7 @@ fn generate_literals_for_json_object(
 
     // Try to convert the phrase to a fast value
    if let Some(term) =
-        convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
+        convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
     {
         logical_literals.push(LogicalLiteral::Term(term));
     }
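End to end, the change means that 1, 1u64, and 1.0 stored under the same JSON path all produce, and match, the same canonical term. A usage sketch against the public API (assumptions: a tantivy 0.22-era API written from memory, the field name `attrs` is hypothetical, and the writer memory budget is arbitrary):

use std::collections::BTreeMap;

use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{OwnedValue, Schema, TEXT};
use tantivy::{Index, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let attrs = schema_builder.add_json_field("attrs", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(50_000_000)?;
    // Three documents holding the "same" number under three JSON types.
    for value in [
        OwnedValue::I64(1),
        OwnedValue::U64(1),
        OwnedValue::F64(1.0),
    ] {
        let mut doc = TantivyDocument::new();
        let mut obj = BTreeMap::default();
        obj.insert("key".to_string(), value);
        doc.add_object(attrs, obj);
        writer.add_document(doc)?;
    }
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let parser = QueryParser::for_index(&index, vec![]);
    // "1" and "1.0" canonicalize to the same term at query time too.
    for query_str in ["attrs.key:1", "attrs.key:1.0"] {
        let query = parser.parse_query(query_str)?;
        assert_eq!(searcher.search(&query, &Count)?, 3);
    }
    Ok(())
}

Both query strings count all three documents, which is exactly what the new test_json_number_ambiguity test asserts from the inside.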