mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-05 16:52:55 +00:00
add coerce option for text and numbers types (#1904)
* add coerce option for text and numbers types allow to coerce the field type when indexing if the type does not match * Apply suggestions from code review Co-authored-by: Paul Masurel <paul@quickwit.io> * add tests,add COERCE flag, include bool in coercion --------- Co-authored-by: Paul Masurel <paul@quickwit.io>
This commit is contained in:
@@ -329,16 +329,66 @@ impl FieldType {
|
||||
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
|
||||
}
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "an integer",
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
FieldType::U64(opt) => {
|
||||
if opt.should_coerce() {
|
||||
Ok(Value::U64(field_text.parse().map_err(|_| {
|
||||
ValueParsingError::TypeError {
|
||||
expected: "a u64 or a u64 as string",
|
||||
json: JsonValue::String(field_text),
|
||||
}
|
||||
})?))
|
||||
} else {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a u64",
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::I64(opt) => {
|
||||
if opt.should_coerce() {
|
||||
Ok(Value::I64(field_text.parse().map_err(|_| {
|
||||
ValueParsingError::TypeError {
|
||||
expected: "a i64 or a i64 as string",
|
||||
json: JsonValue::String(field_text),
|
||||
}
|
||||
})?))
|
||||
} else {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a i64",
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::F64(opt) => {
|
||||
if opt.should_coerce() {
|
||||
Ok(Value::F64(field_text.parse().map_err(|_| {
|
||||
ValueParsingError::TypeError {
|
||||
expected: "a f64 or a f64 as string",
|
||||
json: JsonValue::String(field_text),
|
||||
}
|
||||
})?))
|
||||
} else {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a f64",
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
}
|
||||
}
|
||||
// A JSON string is only accepted for a bool field when coercion is enabled.
// The text is then parsed with `str::parse`, so only the exact spellings
// `true` and `false` are accepted.
FieldType::Bool(opt) => {
    if opt.should_coerce() {
        Ok(Value::Bool(field_text.parse().map_err(|_| {
            // The message names the type actually expected here (bool); it
            // previously mentioned i64, copied from the integer arm.
            ValueParsingError::TypeError {
                expected: "a bool or a bool as string",
                json: JsonValue::String(field_text),
            }
        })?))
    } else {
        // Coercion disabled: a string is never a valid bool value.
        Err(ValueParsingError::TypeError {
            expected: "a boolean",
            json: JsonValue::String(field_text),
        })
    }
}
|
||||
FieldType::Bool(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a boolean",
|
||||
json: JsonValue::String(field_text),
|
||||
}),
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
|
||||
FieldType::Bytes(_) => BASE64
|
||||
.decode(&field_text)
|
||||
@@ -395,12 +445,20 @@ impl FieldType {
|
||||
expected: "a boolean",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
}),
|
||||
FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
FieldType::Str(opt) => {
|
||||
if opt.should_coerce() {
|
||||
Ok(Value::Str(field_val_num.to_string()))
|
||||
} else {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::Facet(_) | FieldType::Bytes(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a string",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
}),
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a json object",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
@@ -431,11 +489,38 @@ impl FieldType {
|
||||
},
|
||||
JsonValue::Bool(json_bool_val) => match self {
|
||||
FieldType::Bool(_) => Ok(Value::Bool(json_bool_val)),
|
||||
FieldType::Str(opt) => {
|
||||
if opt.should_coerce() {
|
||||
Ok(Value::Str(json_bool_val.to_string()))
|
||||
} else {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string",
|
||||
json: JsonValue::Bool(json_bool_val),
|
||||
})
|
||||
}
|
||||
}
|
||||
_ => Err(ValueParsingError::TypeError {
|
||||
expected: self.value_type().name(),
|
||||
json: JsonValue::Bool(json_bool_val),
|
||||
}),
|
||||
},
|
||||
// Could also just filter them
|
||||
JsonValue::Null => match self {
|
||||
FieldType::Str(opt) => {
|
||||
if opt.should_coerce() {
|
||||
Ok(Value::Str("null".to_string()))
|
||||
} else {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string",
|
||||
json: JsonValue::Null,
|
||||
})
|
||||
}
|
||||
}
|
||||
_ => Err(ValueParsingError::TypeError {
|
||||
expected: self.value_type().name(),
|
||||
json: JsonValue::Null,
|
||||
}),
|
||||
},
|
||||
_ => Err(ValueParsingError::TypeError {
|
||||
expected: self.value_type().name(),
|
||||
json: json.clone(),
|
||||
@@ -450,11 +535,90 @@ mod tests {
|
||||
|
||||
use super::FieldType;
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
use crate::schema::{Schema, TextOptions, Type, Value, INDEXED};
|
||||
use crate::schema::{NumericOptions, Schema, TextOptions, Type, Value, COERCE, INDEXED};
|
||||
use crate::time::{Date, Month, PrimitiveDateTime, Time};
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
use crate::{DateTime, Document};
|
||||
|
||||
#[test]
fn test_to_string_coercion() {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("id", COERCE);
    let schema = schema_builder.build();

    // Each non-string JSON value must be stored as its textual form.
    // Not sure if the null coercion (last case) is the best approach.
    let cases = [
        (r#"{"id": 100}"#, "100"),
        (r#"{"id": true}"#, "true"),
        (r#"{"id": null}"#, "null"),
    ];
    for (doc_json, expected_text) in cases {
        let doc = schema.parse_document(doc_json).unwrap();
        assert_eq!(
            &Value::Str(expected_text.to_string()),
            doc.get_first(text_field).unwrap()
        );
    }
}
|
||||
|
||||
#[test]
fn test_to_number_coercion() {
    let mut schema_builder = Schema::builder();
    let i64_field = schema_builder.add_i64_field("i64", COERCE);
    let u64_field = schema_builder.add_u64_field("u64", COERCE);
    let f64_field = schema_builder.add_f64_field("f64", COERCE);
    let schema = schema_builder.build();

    // All three values are JSON strings; with COERCE they must be parsed
    // into the numeric type of their field.
    let doc = schema
        .parse_document(r#"{"i64": "100", "u64": "100", "f64": "100"}"#)
        .unwrap();

    let coerced_i64 = doc.get_first(i64_field).unwrap();
    assert_eq!(&Value::I64(100), coerced_i64);

    let coerced_u64 = doc.get_first(u64_field).unwrap();
    assert_eq!(&Value::U64(100), coerced_u64);

    let coerced_f64 = doc.get_first(f64_field).unwrap();
    assert_eq!(&Value::F64(100.0), coerced_f64);
}
|
||||
|
||||
#[test]
fn test_to_bool_coercion() {
    let mut schema_builder = Schema::builder();
    let bool_field = schema_builder.add_bool_field("bool", COERCE);
    let schema = schema_builder.build();

    // Both textual spellings must be coerced to the matching bool value.
    for (doc_json, expected) in [
        (r#"{"bool": "true"}"#, true),
        (r#"{"bool": "false"}"#, false),
    ] {
        let doc = schema.parse_document(doc_json).unwrap();
        assert_eq!(&Value::Bool(expected), doc.get_first(bool_field).unwrap());
    }
}
|
||||
|
||||
#[test]
fn test_to_number_no_coercion() {
    let mut schema_builder = Schema::builder();
    schema_builder.add_i64_field("i64", NumericOptions::default());
    schema_builder.add_u64_field("u64", NumericOptions::default());
    schema_builder.add_f64_field("f64", NumericOptions::default());
    let schema = schema_builder.build();

    // Without COERCE a numeric value given as a JSON string must be rejected,
    // and the error text must mention the type the field expects.
    let rejected = [
        (r#"{"u64": "100"}"#, "a u64"),
        (r#"{"i64": "100"}"#, "a i64"),
        (r#"{"f64": "100"}"#, "a f64"),
    ];
    for (doc_json, expected_fragment) in rejected {
        let error_message = schema.parse_document(doc_json).unwrap_err().to_string();
        assert!(error_message.contains(expected_fragment));
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_deserialize_json_date() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -31,6 +31,18 @@ pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList {
|
||||
tail: (),
|
||||
};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct CoerceFlag;
|
||||
/// Flag to mark the field as coerced.
|
||||
///
|
||||
/// `COERCE` will try to convert values into its value type if they don't match.
|
||||
///
|
||||
/// See [fast fields](`crate::fastfield`).
|
||||
pub const COERCE: SchemaFlagList<CoerceFlag, ()> = SchemaFlagList {
|
||||
head: CoerceFlag,
|
||||
tail: (),
|
||||
};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FastFlag;
|
||||
/// Flag to mark the field as a fast field (similar to Lucene's DocValues)
|
||||
|
||||
@@ -39,6 +39,7 @@ pub struct JsonObjectOptions {
|
||||
/// `{"root": {"child": {"with": {"dot": "hello"}}}}`
|
||||
/// and it can be search using the following query:
|
||||
/// `root.child.with.dot:hello`
|
||||
#[serde(default)]
|
||||
expand_dots_enabled: bool,
|
||||
}
|
||||
|
||||
|
||||
@@ -138,7 +138,7 @@ pub use self::field::Field;
|
||||
pub use self::field_entry::FieldEntry;
|
||||
pub use self::field_type::{FieldType, Type};
|
||||
pub use self::field_value::FieldValue;
|
||||
pub use self::flags::{FAST, INDEXED, STORED};
|
||||
pub use self::flags::{COERCE, FAST, INDEXED, STORED};
|
||||
pub use self::index_record_option::IndexRecordOption;
|
||||
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};
|
||||
pub use self::json_object_options::JsonObjectOptions;
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::ops::BitOr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::flags::CoerceFlag;
|
||||
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
|
||||
|
||||
#[deprecated(since = "0.17.0", note = "Use NumericOptions instead.")]
|
||||
@@ -17,6 +18,12 @@ pub struct NumericOptions {
|
||||
fieldnorms: bool, // This attribute only has an effect if indexed is true.
|
||||
fast: bool,
|
||||
stored: bool,
|
||||
#[serde(skip_serializing_if = "is_false")]
|
||||
coerce: bool,
|
||||
}
|
||||
|
||||
/// Returns `true` when `val` is `false`.
///
/// Used as the `skip_serializing_if` predicate of the `coerce` field, so that
/// the field is only written out when coercion is enabled.
fn is_false(val: &bool) -> bool {
    !*val
}
|
||||
|
||||
/// For backward compatibility we add an intermediary to interpret the
|
||||
@@ -32,6 +39,8 @@ struct NumericOptionsDeser {
|
||||
#[serde(default)]
|
||||
fast: bool,
|
||||
stored: bool,
|
||||
#[serde(default)]
|
||||
coerce: bool,
|
||||
}
|
||||
|
||||
impl From<NumericOptionsDeser> for NumericOptions {
|
||||
@@ -41,6 +50,7 @@ impl From<NumericOptionsDeser> for NumericOptions {
|
||||
fieldnorms: deser.fieldnorms.unwrap_or(deser.indexed),
|
||||
fast: deser.fast,
|
||||
stored: deser.stored,
|
||||
coerce: deser.coerce,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -66,6 +76,18 @@ impl NumericOptions {
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Returns true if values should be coerced to numbers.
|
||||
pub fn should_coerce(&self) -> bool {
|
||||
self.coerce
|
||||
}
|
||||
|
||||
/// Try to coerce values if they are not a number. Defaults to false.
|
||||
#[must_use]
|
||||
pub fn set_coerce(mut self) -> Self {
|
||||
self.coerce = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the field as stored.
|
||||
///
|
||||
/// Only the fields that are set as *stored* are
|
||||
@@ -117,6 +139,18 @@ impl From<()> for NumericOptions {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the options produced by the `COERCE` flag alone: only coercion is
/// switched on, every other option is off.
impl From<CoerceFlag> for NumericOptions {
    fn from(_flag: CoerceFlag) -> Self {
        Self {
            coerce: true,
            indexed: false,
            fieldnorms: false,
            fast: false,
            stored: false,
        }
    }
}
|
||||
|
||||
impl From<FastFlag> for NumericOptions {
|
||||
fn from(_: FastFlag) -> Self {
|
||||
NumericOptions {
|
||||
@@ -124,6 +158,7 @@ impl From<FastFlag> for NumericOptions {
|
||||
fieldnorms: false,
|
||||
stored: false,
|
||||
fast: true,
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -135,6 +170,7 @@ impl From<StoredFlag> for NumericOptions {
|
||||
fieldnorms: false,
|
||||
stored: true,
|
||||
fast: false,
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -146,6 +182,7 @@ impl From<IndexedFlag> for NumericOptions {
|
||||
fieldnorms: true,
|
||||
stored: false,
|
||||
fast: false,
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -160,6 +197,7 @@ impl<T: Into<NumericOptions>> BitOr<T> for NumericOptions {
|
||||
fieldnorms: self.fieldnorms | other.fieldnorms,
|
||||
stored: self.stored | other.stored,
|
||||
fast: self.fast | other.fast,
|
||||
coerce: self.coerce | other.coerce,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -192,7 +230,8 @@ mod tests {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
fast: false,
|
||||
stored: false
|
||||
stored: false,
|
||||
coerce: false,
|
||||
}
|
||||
);
|
||||
}
|
||||
@@ -210,7 +249,8 @@ mod tests {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
fast: false,
|
||||
stored: false
|
||||
stored: false,
|
||||
coerce: false,
|
||||
}
|
||||
);
|
||||
}
|
||||
@@ -229,7 +269,8 @@ mod tests {
|
||||
indexed: true,
|
||||
fieldnorms: false,
|
||||
fast: false,
|
||||
stored: false
|
||||
stored: false,
|
||||
coerce: false,
|
||||
}
|
||||
);
|
||||
}
|
||||
@@ -249,7 +290,30 @@ mod tests {
|
||||
indexed: false,
|
||||
fieldnorms: true,
|
||||
fast: false,
|
||||
stored: false
|
||||
stored: false,
|
||||
coerce: false,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_int_options_deser_if_coerce_true() {
    // This one is kind of useless, at least at the moment: it only checks
    // that an explicit `coerce: true` survives deserialization.
    let json = r#"{
        "indexed": false,
        "fieldnorms": true,
        "stored": false,
        "coerce": true
    }"#;
    let expected = NumericOptions {
        indexed: false,
        fieldnorms: true,
        fast: false,
        stored: false,
        coerce: true,
    };
    let int_options: NumericOptions = serde_json::from_str(json).unwrap();
    assert_eq!(&int_options, &expected);
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::ops::BitOr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::flags::FastFlag;
|
||||
use super::flags::{CoerceFlag, FastFlag};
|
||||
use crate::schema::flags::{SchemaFlagList, StoredFlag};
|
||||
use crate::schema::IndexRecordOption;
|
||||
|
||||
@@ -17,6 +17,14 @@ pub struct TextOptions {
|
||||
stored: bool,
|
||||
#[serde(default)]
|
||||
fast: bool,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "is_false")]
|
||||
/// coerce values if they are not of type string
|
||||
coerce: bool,
|
||||
}
|
||||
|
||||
/// Returns `true` when `val` is `false`.
///
/// Used as the `skip_serializing_if` predicate of the `coerce` field, so that
/// the field is only written out when coercion is enabled.
fn is_false(val: &bool) -> bool {
    !*val
}
|
||||
|
||||
impl TextOptions {
|
||||
@@ -35,6 +43,11 @@ impl TextOptions {
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Returns true if values should be coerced to strings (numbers, null).
|
||||
pub fn should_coerce(&self) -> bool {
|
||||
self.coerce
|
||||
}
|
||||
|
||||
/// Set the field as a fast field.
|
||||
///
|
||||
/// Fast fields are designed for random access.
|
||||
@@ -56,7 +69,14 @@ impl TextOptions {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the field as stored
|
||||
/// Coerce values if they are not of type string. Defaults to false.
|
||||
#[must_use]
|
||||
pub fn set_coerce(mut self) -> TextOptions {
|
||||
self.coerce = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the field as stored.
|
||||
#[must_use]
|
||||
pub fn set_stored(mut self) -> TextOptions {
|
||||
self.stored = true;
|
||||
@@ -180,6 +200,7 @@ pub const STRING: TextOptions = TextOptions {
|
||||
}),
|
||||
stored: false,
|
||||
fast: false,
|
||||
coerce: false,
|
||||
};
|
||||
|
||||
/// The field will be tokenized and indexed.
|
||||
@@ -190,6 +211,7 @@ pub const TEXT: TextOptions = TextOptions {
|
||||
record: IndexRecordOption::WithFreqsAndPositions,
|
||||
}),
|
||||
stored: false,
|
||||
coerce: false,
|
||||
fast: false,
|
||||
};
|
||||
|
||||
@@ -202,6 +224,7 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
||||
indexing: self.indexing.or(other.indexing),
|
||||
stored: self.stored | other.stored,
|
||||
fast: self.fast | other.fast,
|
||||
coerce: self.coerce | other.coerce,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -218,6 +241,18 @@ impl From<StoredFlag> for TextOptions {
|
||||
indexing: None,
|
||||
stored: true,
|
||||
fast: false,
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the options produced by the `COERCE` flag alone: coercion is on,
/// the field is neither indexed, stored nor fast.
impl From<CoerceFlag> for TextOptions {
    fn from(_flag: CoerceFlag) -> Self {
        Self {
            coerce: true,
            indexing: None,
            stored: false,
            fast: false,
        }
    }
}
|
||||
@@ -228,6 +263,7 @@ impl From<FastFlag> for TextOptions {
|
||||
indexing: None,
|
||||
stored: false,
|
||||
fast: true,
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user