add coerce option for text and numbers types (#1904)

* add coerce option for text and numbers types

Allow coercing a value to the field's type when indexing if the value's type does not match.

* Apply suggestions from code review

Co-authored-by: Paul Masurel <paul@quickwit.io>

* add tests, add COERCE flag, include bool in coercion

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
This commit is contained in:
PSeitz
2023-03-01 18:36:59 +08:00
committed by GitHub
parent 850a0d7ae2
commit faa706d804
6 changed files with 299 additions and 22 deletions

View File

@@ -329,16 +329,66 @@ impl FieldType {
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
}
FieldType::Str(_) => Ok(Value::Str(field_text)),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
Err(ValueParsingError::TypeError {
expected: "an integer",
json: JsonValue::String(field_text),
})
FieldType::U64(opt) => {
if opt.should_coerce() {
Ok(Value::U64(field_text.parse().map_err(|_| {
ValueParsingError::TypeError {
expected: "a u64 or a u64 as string",
json: JsonValue::String(field_text),
}
})?))
} else {
Err(ValueParsingError::TypeError {
expected: "a u64",
json: JsonValue::String(field_text),
})
}
}
FieldType::I64(opt) => {
if opt.should_coerce() {
Ok(Value::I64(field_text.parse().map_err(|_| {
ValueParsingError::TypeError {
expected: "a i64 or a i64 as string",
json: JsonValue::String(field_text),
}
})?))
} else {
Err(ValueParsingError::TypeError {
expected: "a i64",
json: JsonValue::String(field_text),
})
}
}
FieldType::F64(opt) => {
if opt.should_coerce() {
Ok(Value::F64(field_text.parse().map_err(|_| {
ValueParsingError::TypeError {
expected: "a f64 or a f64 as string",
json: JsonValue::String(field_text),
}
})?))
} else {
Err(ValueParsingError::TypeError {
expected: "a f64",
json: JsonValue::String(field_text),
})
}
}
FieldType::Bool(opt) => {
    // A JSON string was supplied for a bool field. It is only accepted when
    // coercion is enabled, and only if the text parses as a bool.
    if opt.should_coerce() {
        Ok(Value::Bool(field_text.parse().map_err(|_| {
            ValueParsingError::TypeError {
                // Name the bool type, consistent with the non-coercing branch
                // below; the message used to say "a i64", copied from the
                // integer arm.
                expected: "a boolean or a boolean as string",
                json: JsonValue::String(field_text),
            }
        })?))
    } else {
        // Coercion disabled: a string is never a valid bool value.
        Err(ValueParsingError::TypeError {
            expected: "a boolean",
            json: JsonValue::String(field_text),
        })
    }
}
FieldType::Bool(_) => Err(ValueParsingError::TypeError {
expected: "a boolean",
json: JsonValue::String(field_text),
}),
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
FieldType::Bytes(_) => BASE64
.decode(&field_text)
@@ -395,12 +445,20 @@ impl FieldType {
expected: "a boolean",
json: JsonValue::Number(field_val_num),
}),
FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
Err(ValueParsingError::TypeError {
expected: "a string",
json: JsonValue::Number(field_val_num),
})
FieldType::Str(opt) => {
if opt.should_coerce() {
Ok(Value::Str(field_val_num.to_string()))
} else {
Err(ValueParsingError::TypeError {
expected: "a string",
json: JsonValue::Number(field_val_num),
})
}
}
FieldType::Facet(_) | FieldType::Bytes(_) => Err(ValueParsingError::TypeError {
expected: "a string",
json: JsonValue::Number(field_val_num),
}),
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
expected: "a json object",
json: JsonValue::Number(field_val_num),
@@ -431,11 +489,38 @@ impl FieldType {
},
JsonValue::Bool(json_bool_val) => match self {
FieldType::Bool(_) => Ok(Value::Bool(json_bool_val)),
FieldType::Str(opt) => {
if opt.should_coerce() {
Ok(Value::Str(json_bool_val.to_string()))
} else {
Err(ValueParsingError::TypeError {
expected: "a string",
json: JsonValue::Bool(json_bool_val),
})
}
}
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: JsonValue::Bool(json_bool_val),
}),
},
// Could also just filter them
JsonValue::Null => match self {
FieldType::Str(opt) => {
if opt.should_coerce() {
Ok(Value::Str("null".to_string()))
} else {
Err(ValueParsingError::TypeError {
expected: "a string",
json: JsonValue::Null,
})
}
}
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: JsonValue::Null,
}),
},
_ => Err(ValueParsingError::TypeError {
expected: self.value_type().name(),
json: json.clone(),
@@ -450,11 +535,90 @@ mod tests {
use super::FieldType;
use crate::schema::field_type::ValueParsingError;
use crate::schema::{Schema, TextOptions, Type, Value, INDEXED};
use crate::schema::{NumericOptions, Schema, TextOptions, Type, Value, COERCE, INDEXED};
use crate::time::{Date, Month, PrimitiveDateTime, Time};
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{DateTime, Document};
#[test]
fn test_to_string_coercion() {
    let mut builder = Schema::builder();
    let id_field = builder.add_text_field("id", COERCE);
    let schema = builder.build();

    // Every non-string JSON value is expected to end up as its textual form.
    // Not sure if the null coercion (last case) is the best approach.
    let cases = [
        (r#"{"id": 100}"#, "100"),
        (r#"{"id": true}"#, "true"),
        (r#"{"id": null}"#, "null"),
    ];
    for (doc_json, expected_text) in cases {
        let doc = schema.parse_document(doc_json).unwrap();
        assert_eq!(
            &Value::Str(expected_text.to_string()),
            doc.get_first(id_field).unwrap()
        );
    }
}
#[test]
fn test_to_number_coercion() {
    // One coercing field per numeric type.
    let mut builder = Schema::builder();
    let signed = builder.add_i64_field("i64", COERCE);
    let unsigned = builder.add_u64_field("u64", COERCE);
    let float = builder.add_f64_field("f64", COERCE);
    let schema = builder.build();

    // All three values arrive as JSON strings and must be parsed into numbers.
    let parsed = schema
        .parse_document(r#"{"i64": "100", "u64": "100", "f64": "100"}"#)
        .unwrap();

    assert_eq!(&Value::I64(100), parsed.get_first(signed).unwrap());
    assert_eq!(&Value::U64(100), parsed.get_first(unsigned).unwrap());
    assert_eq!(&Value::F64(100.0), parsed.get_first(float).unwrap());
}
#[test]
fn test_to_bool_coercion() {
    let mut builder = Schema::builder();
    let flag_field = builder.add_bool_field("bool", COERCE);
    let schema = builder.build();

    // Both textual bool spellings must be coerced to the matching bool value.
    let cases = [
        (r#"{"bool": "true"}"#, true),
        (r#"{"bool": "false"}"#, false),
    ];
    for (doc_json, expected) in cases {
        let doc = schema.parse_document(doc_json).unwrap();
        assert_eq!(&Value::Bool(expected), doc.get_first(flag_field).unwrap());
    }
}
#[test]
fn test_to_number_no_coercion() {
    // Default numeric options: coercion is off.
    let mut builder = Schema::builder();
    builder.add_i64_field("i64", NumericOptions::default());
    builder.add_u64_field("u64", NumericOptions::default());
    builder.add_f64_field("f64", NumericOptions::default());
    let schema = builder.build();

    // A JSON string must be rejected, and the error must name the expected type.
    let cases = [
        (r#"{"u64": "100"}"#, "a u64"),
        (r#"{"i64": "100"}"#, "a i64"),
        (r#"{"f64": "100"}"#, "a f64"),
    ];
    for (doc_json, expected_fragment) in cases {
        let message = schema.parse_document(doc_json).unwrap_err().to_string();
        assert!(message.contains(expected_fragment));
    }
}
#[test]
fn test_deserialize_json_date() {
let mut schema_builder = Schema::builder();

View File

@@ -31,6 +31,18 @@ pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList {
tail: (),
};
/// Marker type behind the [`COERCE`] schema flag.
#[derive(Clone)]
pub struct CoerceFlag;
/// Flag to enable value coercion on a field.
///
/// With `COERCE`, a value whose JSON type does not match the field's value type is
/// converted when possible instead of being rejected:
/// - for `u64`, `i64`, `f64` and `bool` fields, a JSON string is parsed into the field's type
///   (a string that does not parse is still a type error);
/// - for text fields, a JSON number, boolean or `null` is stored as its textual form.
pub const COERCE: SchemaFlagList<CoerceFlag, ()> = SchemaFlagList {
head: CoerceFlag,
tail: (),
};
#[derive(Clone)]
pub struct FastFlag;
/// Flag to mark the field as a fast field (similar to Lucene's DocValues)

View File

@@ -39,6 +39,7 @@ pub struct JsonObjectOptions {
/// `{"root": {"child": {"with": {"dot": "hello"}}}}`
/// and it can be search using the following query:
/// `root.child.with.dot:hello`
#[serde(default)]
expand_dots_enabled: bool,
}

View File

@@ -138,7 +138,7 @@ pub use self::field::Field;
pub use self::field_entry::FieldEntry;
pub use self::field_type::{FieldType, Type};
pub use self::field_value::FieldValue;
pub use self::flags::{FAST, INDEXED, STORED};
pub use self::flags::{COERCE, FAST, INDEXED, STORED};
pub use self::index_record_option::IndexRecordOption;
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};
pub use self::json_object_options::JsonObjectOptions;

View File

@@ -2,6 +2,7 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::flags::CoerceFlag;
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
#[deprecated(since = "0.17.0", note = "Use NumericOptions instead.")]
@@ -17,6 +18,12 @@ pub struct NumericOptions {
fieldnorms: bool, // This attribute only has an effect if indexed is true.
fast: bool,
stored: bool,
#[serde(skip_serializing_if = "is_false")]
coerce: bool,
}
/// Predicate for `#[serde(skip_serializing_if = ...)]`: true when the flag is unset,
/// so that a `false` value is left out of the serialized output.
fn is_false(val: &bool) -> bool {
    !*val
}
/// For backward compatibility we add an intermediary to interpret the
@@ -32,6 +39,8 @@ struct NumericOptionsDeser {
#[serde(default)]
fast: bool,
stored: bool,
#[serde(default)]
coerce: bool,
}
impl From<NumericOptionsDeser> for NumericOptions {
@@ -41,6 +50,7 @@ impl From<NumericOptionsDeser> for NumericOptions {
fieldnorms: deser.fieldnorms.unwrap_or(deser.indexed),
fast: deser.fast,
stored: deser.stored,
coerce: deser.coerce,
}
}
}
@@ -66,6 +76,18 @@ impl NumericOptions {
self.fast
}
/// Returns true if coercion is enabled for this field.
///
/// When enabled, a JSON string supplied for the field is parsed into the field's
/// value type while parsing a document, instead of being rejected with a type error.
pub fn should_coerce(&self) -> bool {
self.coerce
}
/// Try to coerce values if they are not a number. Defaults to false.
#[must_use]
pub fn set_coerce(mut self) -> Self {
self.coerce = true;
self
}
/// Set the field as stored.
///
/// Only the fields that are set as *stored* are
@@ -117,6 +139,18 @@ impl From<()> for NumericOptions {
}
}
impl From<CoerceFlag> for NumericOptions {
fn from(_: CoerceFlag) -> NumericOptions {
NumericOptions {
indexed: false,
fieldnorms: false,
stored: false,
fast: false,
coerce: true,
}
}
}
impl From<FastFlag> for NumericOptions {
fn from(_: FastFlag) -> Self {
NumericOptions {
@@ -124,6 +158,7 @@ impl From<FastFlag> for NumericOptions {
fieldnorms: false,
stored: false,
fast: true,
coerce: false,
}
}
}
@@ -135,6 +170,7 @@ impl From<StoredFlag> for NumericOptions {
fieldnorms: false,
stored: true,
fast: false,
coerce: false,
}
}
}
@@ -146,6 +182,7 @@ impl From<IndexedFlag> for NumericOptions {
fieldnorms: true,
stored: false,
fast: false,
coerce: false,
}
}
}
@@ -160,6 +197,7 @@ impl<T: Into<NumericOptions>> BitOr<T> for NumericOptions {
fieldnorms: self.fieldnorms | other.fieldnorms,
stored: self.stored | other.stored,
fast: self.fast | other.fast,
coerce: self.coerce | other.coerce,
}
}
}
@@ -192,7 +230,8 @@ mod tests {
indexed: true,
fieldnorms: true,
fast: false,
stored: false
stored: false,
coerce: false,
}
);
}
@@ -210,7 +249,8 @@ mod tests {
indexed: false,
fieldnorms: false,
fast: false,
stored: false
stored: false,
coerce: false,
}
);
}
@@ -229,7 +269,8 @@ mod tests {
indexed: true,
fieldnorms: false,
fast: false,
stored: false
stored: false,
coerce: false,
}
);
}
@@ -249,7 +290,30 @@ mod tests {
indexed: false,
fieldnorms: true,
fast: false,
stored: false
stored: false,
coerce: false,
}
);
}
#[test]
fn test_int_options_deser_if_coerce_true() {
    // `fieldnorms: true` with `indexed: false` has no practical effect (fieldnorms only
    // matter for indexed fields); the point of this test is that `coerce` is read back.
    let expected = NumericOptions {
        coerce: true,
        fieldnorms: true,
        indexed: false,
        fast: false,
        stored: false,
    };
    let parsed: NumericOptions = serde_json::from_str(
        r#"{
"indexed": false,
"fieldnorms": true,
"stored": false,
"coerce": true
}"#,
    )
    .unwrap();
    assert_eq!(&parsed, &expected);
}

View File

@@ -3,7 +3,7 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::flags::FastFlag;
use super::flags::{CoerceFlag, FastFlag};
use crate::schema::flags::{SchemaFlagList, StoredFlag};
use crate::schema::IndexRecordOption;
@@ -17,6 +17,14 @@ pub struct TextOptions {
stored: bool,
#[serde(default)]
fast: bool,
#[serde(default)]
#[serde(skip_serializing_if = "is_false")]
/// coerce values if they are not of type string
coerce: bool,
}
/// Predicate for `#[serde(skip_serializing_if = ...)]`: true when the flag is unset,
/// so that a `false` value is left out of the serialized output.
fn is_false(val: &bool) -> bool {
    !*val
}
impl TextOptions {
@@ -35,6 +43,11 @@ impl TextOptions {
self.fast
}
/// Returns true if non-string values (numbers, booleans, null) should be coerced to strings.
pub fn should_coerce(&self) -> bool {
self.coerce
}
/// Set the field as a fast field.
///
/// Fast fields are designed for random access.
@@ -56,7 +69,14 @@ impl TextOptions {
self
}
/// Sets the field as stored
/// Coerce values if they are not of type string. Defaults to false.
#[must_use]
pub fn set_coerce(mut self) -> TextOptions {
self.coerce = true;
self
}
/// Sets the field as stored.
#[must_use]
pub fn set_stored(mut self) -> TextOptions {
self.stored = true;
@@ -180,6 +200,7 @@ pub const STRING: TextOptions = TextOptions {
}),
stored: false,
fast: false,
coerce: false,
};
/// The field will be tokenized and indexed.
@@ -190,6 +211,7 @@ pub const TEXT: TextOptions = TextOptions {
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,
coerce: false,
fast: false,
};
@@ -202,6 +224,7 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
indexing: self.indexing.or(other.indexing),
stored: self.stored | other.stored,
fast: self.fast | other.fast,
coerce: self.coerce | other.coerce,
}
}
}
@@ -218,6 +241,18 @@ impl From<StoredFlag> for TextOptions {
indexing: None,
stored: true,
fast: false,
coerce: false,
}
}
}
impl From<CoerceFlag> for TextOptions {
fn from(_: CoerceFlag) -> TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: false,
coerce: true,
}
}
}
@@ -228,6 +263,7 @@ impl From<FastFlag> for TextOptions {
indexing: None,
stored: false,
fast: true,
coerce: false,
}
}
}