From 43225a8eee0aae4303f969b0ef8e2c421c79b594 Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Wed, 15 Apr 2026 11:38:01 +0800 Subject: [PATCH] feat: introducing "JSON2" type (#7965) Signed-off-by: luofucong --- src/api/src/helper.rs | 2 +- src/common/sql/src/convert.rs | 2 +- src/datatypes/src/data_type.rs | 2 +- src/datatypes/src/json/value.rs | 4 +- src/datatypes/src/types/json_type.rs | 192 +++++++++++------- src/datatypes/src/vectors/helper.rs | 8 - src/datatypes/src/vectors/json/builder.rs | 191 +---------------- .../src/etl/transform/transformer/greptime.rs | 13 +- src/sql/src/statements.rs | 77 ++++--- src/sql/src/statements/create.rs | 47 +++-- tests-integration/tests/jsonbench.rs | 4 + .../stats_schema_mismatch_regression.result | 29 ++- .../stats_schema_mismatch_regression.sql | 21 +- .../common/types/json/json-structured.result | 82 -------- .../common/types/json/json-structured.sql | 28 --- .../standalone/common/types/json/json2.result | 14 ++ .../standalone/common/types/json/json2.sql | 9 + 17 files changed, 252 insertions(+), 473 deletions(-) delete mode 100644 tests/cases/standalone/common/types/json/json-structured.result delete mode 100644 tests/cases/standalone/common/types/json/json-structured.sql create mode 100644 tests/cases/standalone/common/types/json/json2.result create mode 100644 tests/cases/standalone/common/types/json/json2.sql diff --git a/src/api/src/helper.rs b/src/api/src/helper.rs index 4664c0434b..b6addf3da6 100644 --- a/src/api/src/helper.rs +++ b/src/api/src/helper.rs @@ -444,7 +444,7 @@ impl TryFrom for ColumnDataTypeWrapper { JsonFormat::Jsonb => Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), }), - JsonFormat::Native(native_type) => { + JsonFormat::Json2(native_type) => { if native_type.is_null() { None } else { diff --git a/src/common/sql/src/convert.rs b/src/common/sql/src/convert.rs index bd9a1d0769..be9701f832 100644 --- a/src/common/sql/src/convert.rs +++ b/src/common/sql/src/convert.rs @@ -306,7 +306,7 @@ pub(crate) fn parse_string_to_value( let v = parse_string_to_jsonb(&s).context(DatatypeSnafu)?; Ok(Value::Binary(v.into())) } - JsonFormat::Native(_) => { + JsonFormat::Json2(_) => { let extension_type: Option = column_schema.extension_type().context(DatatypeSnafu)?; let json_structure_settings = extension_type diff --git a/src/datatypes/src/data_type.rs b/src/datatypes/src/data_type.rs index e39d31d3d1..cfe06b27ea 100644 --- a/src/datatypes/src/data_type.rs +++ b/src/datatypes/src/data_type.rs @@ -688,7 +688,7 @@ impl ConcreteDataType { } pub fn json_native_datatype(inner_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::Json(JsonType::new_native((&inner_type).into())) + ConcreteDataType::Json(JsonType::new_json2((&inner_type).into())) } } diff --git a/src/datatypes/src/json/value.rs b/src/datatypes/src/json/value.rs index f8cf71e936..dfc732bc9a 100644 --- a/src/datatypes/src/json/value.rs +++ b/src/datatypes/src/json/value.rs @@ -170,7 +170,7 @@ impl JsonVariant { } fn json_type(&self) -> JsonType { - JsonType::new_native(self.native_type()) + JsonType::new_json2(self.native_type()) } fn as_ref(&self) -> JsonVariantRef<'_> { @@ -526,7 +526,7 @@ impl JsonVariantRef<'_> { ), } } - JsonType::new_native(native_type(self)) + JsonType::new_json2(native_type(self)) } } diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 13aeffb26c..61d3763374 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -18,6 +18,7 @@ use std::str::FromStr; use std::sync::{Arc, LazyLock}; use arrow::datatypes::DataType as ArrowDataType; +use arrow_schema::Fields; use common_base::bytes::Bytes; use regex::{Captures, Regex}; use serde::{Deserialize, Serialize}; @@ -36,6 +37,7 @@ use crate::vectors::json::builder::JsonVectorBuilder; use crate::vectors::{BinaryVectorBuilder, MutableVector}; pub const JSON_TYPE_NAME: &str = "Json"; +const JSON2_TYPE_NAME: &str = "Json2"; const JSON_PLAIN_FIELD_NAME: &str = "__json_plain__"; const JSON_PLAIN_FIELD_METADATA_KEY: &str = "is_plain_json"; @@ -56,6 +58,10 @@ pub enum JsonNativeType { String, Array(Box), Object(JsonObjectType), + /// A special (not in the JSON official specification) JSON type to indicate the "resolved" or + /// "lifted" type of two conflicting JSON types. For example, when merging JSON types of "Bool" + /// and "Number". + Variant, } impl JsonNativeType { @@ -97,6 +103,7 @@ impl From<&JsonNativeType> for ConcreteDataType { .collect(); ConcreteDataType::Struct(StructType::new(Arc::new(fields))) } + JsonNativeType::Variant => ConcreteDataType::binary_datatype(), } } } @@ -127,6 +134,7 @@ impl From<&ConcreteDataType> for JsonNativeType { .collect(), ), ConcreteDataType::Json(json_type) => json_type.native_type().clone(), + ConcreteDataType::Binary(_) => JsonNativeType::Variant, _ => unreachable!(), } } @@ -155,6 +163,7 @@ impl Display for JsonNativeType { .join(",") ) } + JsonNativeType::Variant => write!(f, r#""""#), } } } @@ -163,7 +172,7 @@ impl Display for JsonNativeType { pub enum JsonFormat { #[default] Jsonb, - Native(Box), + Json2(Box), } /// JsonType is a data type for JSON data. It is stored as binary data of jsonb format. @@ -178,26 +187,26 @@ impl JsonType { Self { format } } - pub(crate) fn new_native(native: JsonNativeType) -> Self { + pub(crate) fn new_json2(native: JsonNativeType) -> Self { Self { - format: JsonFormat::Native(Box::new(native)), + format: JsonFormat::Json2(Box::new(native)), } } - pub fn is_native_type(&self) -> bool { - matches!(self.format, JsonFormat::Native(_)) + pub fn is_json2(&self) -> bool { + matches!(self.format, JsonFormat::Json2(_)) } - pub fn native_type(&self) -> &JsonNativeType { + pub(crate) fn native_type(&self) -> &JsonNativeType { match &self.format { JsonFormat::Jsonb => &JsonNativeType::String, - JsonFormat::Native(x) => x.as_ref(), + JsonFormat::Json2(x) => x.as_ref(), } } pub fn null() -> Self { Self { - format: JsonFormat::Native(Box::new(JsonNativeType::Null)), + format: JsonFormat::Json2(Box::new(JsonNativeType::Null)), } } @@ -208,7 +217,7 @@ impl JsonType { pub(crate) fn as_struct_type(&self) -> StructType { match &self.format { JsonFormat::Jsonb => StructType::default(), - JsonFormat::Native(inner) => match ConcreteDataType::from(inner.as_ref()) { + JsonFormat::Json2(inner) => match ConcreteDataType::from(inner.as_ref()) { ConcreteDataType::Struct(t) => t.clone(), x => plain_json_struct_type(x), }, @@ -219,9 +228,9 @@ impl JsonType { pub fn merge(&mut self, other: &JsonType) -> Result<()> { match (&self.format, &other.format) { (JsonFormat::Jsonb, JsonFormat::Jsonb) => Ok(()), - (JsonFormat::Native(this), JsonFormat::Native(that)) => { - let merged = merge(this.as_ref(), that.as_ref())?; - self.format = JsonFormat::Native(Box::new(merged)); + (JsonFormat::Json2(this), JsonFormat::Json2(that)) => { + let merged = merge(this.as_ref(), that.as_ref()); + self.format = JsonFormat::Json2(Box::new(merged)); Ok(()) } _ => MergeJsonDatatypeSnafu { @@ -232,10 +241,10 @@ impl JsonType { } /// Check if it can merge with `other` json type. - pub fn is_mergeable(&self, other: &JsonType) -> bool { + pub(crate) fn is_mergeable(&self, other: &JsonType) -> bool { match (&self.format, &other.format) { (JsonFormat::Jsonb, JsonFormat::Jsonb) => true, - (JsonFormat::Native(this), JsonFormat::Native(that)) => { + (JsonFormat::Json2(this), JsonFormat::Json2(that)) => { is_mergeable(this.as_ref(), that.as_ref()) } _ => false, @@ -246,7 +255,7 @@ impl JsonType { pub fn is_include(&self, other: &JsonType) -> bool { match (&self.format, &other.format) { (JsonFormat::Jsonb, JsonFormat::Jsonb) => true, - (JsonFormat::Native(this), JsonFormat::Native(that)) => { + (JsonFormat::Json2(this), JsonFormat::Json2(that)) => { is_include(this.as_ref(), that.as_ref()) } _ => false, @@ -313,34 +322,31 @@ fn is_mergeable(this: &JsonNativeType, that: &JsonNativeType) -> bool { } } -fn merge(this: &JsonNativeType, that: &JsonNativeType) -> Result { - fn merge_object(this: &JsonObjectType, that: &JsonObjectType) -> Result { +fn merge(this: &JsonNativeType, that: &JsonNativeType) -> JsonNativeType { + fn merge_object(this: &JsonObjectType, that: &JsonObjectType) -> JsonObjectType { let mut this = this.clone(); // merge "that" into "this" directly: for (type_name, that_type) in that { if let Some(this_type) = this.get_mut(type_name) { - let merged_type = merge(this_type, that_type)?; + let merged_type = merge(this_type, that_type); *this_type = merged_type; } else { this.insert(type_name.clone(), that_type.clone()); } } - Ok(this) + this } match (this, that) { - (this, that) if this == that => Ok(this.clone()), + (this, that) if this == that => this.clone(), (JsonNativeType::Array(this), JsonNativeType::Array(that)) => { - merge(this.as_ref(), that.as_ref()).map(|x| JsonNativeType::Array(Box::new(x))) + JsonNativeType::Array(Box::new(merge(this.as_ref(), that.as_ref()))) } (JsonNativeType::Object(this), JsonNativeType::Object(that)) => { - merge_object(this, that).map(JsonNativeType::Object) + JsonNativeType::Object(merge_object(this, that)) } - (JsonNativeType::Null, x) | (x, JsonNativeType::Null) => Ok(x.clone()), - _ => MergeJsonDatatypeSnafu { - reason: format!("datatypes have conflict, this: {this}, that: {that}"), - } - .fail(), + (JsonNativeType::Null, x) | (x, JsonNativeType::Null) => x.clone(), + _ => JsonNativeType::Variant, } } @@ -348,7 +354,7 @@ impl DataType for JsonType { fn name(&self) -> String { match &self.format { JsonFormat::Jsonb => JSON_TYPE_NAME.to_string(), - JsonFormat::Native(x) => format!("Json<{x}>"), + JsonFormat::Json2(_) => JSON2_TYPE_NAME.to_string(), } } @@ -363,14 +369,19 @@ impl DataType for JsonType { fn as_arrow_type(&self) -> ArrowDataType { match self.format { JsonFormat::Jsonb => ArrowDataType::Binary, - JsonFormat::Native(_) => self.as_struct_type().as_arrow_type(), + // "Erase" the JSON struct when converting to Arrow datatype, is a feature (not a bug). + // The actual Arrow datatype is deduced from parquet data and query schema (a process + // called "JSON type concretization") from time to time, there's no a universal/global + // type for JSON2. + // Same reason for ignoring the struct in the `name` method above. + JsonFormat::Json2(_) => ArrowDataType::Struct(Fields::empty()), } } fn create_mutable_vector(&self, capacity: usize) -> Box { match &self.format { JsonFormat::Jsonb => Box::new(BinaryVectorBuilder::with_capacity(capacity)), - JsonFormat::Native(x) => Box::new(JsonVectorBuilder::new(*x.clone(), capacity)), + JsonFormat::Json2(x) => Box::new(JsonVectorBuilder::new(*x.clone(), capacity)), } } @@ -742,68 +753,93 @@ mod tests { let result = json_type.merge(other); match (result, expected) { (Ok(()), Ok(expected)) => { - assert_eq!(json_type.name(), expected); - assert!(json_type.is_mergeable(other)); + assert_eq!(json_type.native_type().to_string(), expected); } (Err(err), Err(expected)) => { assert_eq!(err.to_string(), expected); - assert!(!json_type.is_mergeable(other)); } _ => unreachable!(), } Ok(()) } - let json_type = &mut JsonType::new_native(JsonNativeType::Null); + // Null should be absorbed by a concrete scalar type. + test("true", &mut JsonType::null(), Ok(r#""""#))?; - // can merge with json object: - let json = r#"{ - "hello": "world", - "list": [1, 2, 3], - "object": {"a": 1} - }"#; - let expected = - r#"Json<{"hello":"","list":[""],"object":{"a":""}}>"#; - test(json, json_type, Ok(expected))?; + // Merging a null value into an existing concrete type should keep the type unchanged. + test( + "null", + &mut JsonType::new_json2(JsonNativeType::Bool), + Ok(r#""""#), + )?; - // cannot merge with other non-object json values: - let jsons = [r#""s""#, "1", "[1]"]; - let expects = [ - r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"","list":[""],"object":{"a":""}}, that: """#, - r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"","list":[""],"object":{"a":""}}, that: """#, - r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"","list":[""],"object":{"a":""}}, that: [""]"#, - ]; - for (json, expect) in jsons.into_iter().zip(expects) { - test(json, json_type, Err(expect))?; - } + // Identical number categories should stay as Number. + test( + "1", + &mut JsonType::new_json2(JsonNativeType::i64()), + Ok(r#""""#), + )?; - // cannot merge with other json object with conflict field datatype: - let json = r#"{ - "hello": 1, - "float": 0.123, - "no": 42 - }"#; - let expected = r#"Failed to merge JSON datatype: datatypes have conflict, this: "", that: """#; - test(json, json_type, Err(expected))?; + // Conflicting number categories should be lifted to Variant. + test( + "1.5", + &mut JsonType::new_json2(JsonNativeType::i64()), + Ok(r#""""#), + )?; - // can merge with another json object: - let json = r#"{ - "hello": "greptime", - "float": 0.123, - "int": 42 - }"#; - let expected = r#"Json<{"float":"","hello":"","int":"","list":[""],"object":{"a":""}}>"#; - test(json, json_type, Ok(expected))?; + // Object merge should preserve existing fields and append missing fields. + test( + r#"{"foo":"x"}"#, + &mut JsonType::new_json2(JsonNativeType::Object(JsonObjectType::from([( + "bar".to_string(), + JsonNativeType::i64(), + )]))), + Ok(r#"{"bar":"","foo":""}"#), + )?; - // can merge with some complex nested json object: - let json = r#"{ - "list": [4], - "object": {"foo": "bar", "l": ["x"], "o": {"key": "value"}}, - "float": 0.456, - "int": 0 - }"#; - let expected = r#"Json<{"float":"","hello":"","int":"","list":[""],"object":{"a":"","foo":"","l":[""],"o":{"key":""}}}>"#; - test(json, json_type, Ok(expected))?; + // Conflicting object field types should only lift that field to Variant. + test( + r#"{"foo":1}"#, + &mut JsonType::new_json2(JsonNativeType::Object(JsonObjectType::from([( + "foo".to_string(), + JsonNativeType::Bool, + )]))), + Ok(r#"{"foo":""}"#), + )?; + + // Nested objects should merge recursively. + test( + r#"{"nested":{"foo":"bar"}}"#, + &mut JsonType::new_json2(JsonNativeType::Object(JsonObjectType::from([( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "bar".to_string(), + JsonNativeType::Bool, + )])), + )]))), + Ok(r#"{"nested":{"bar":"","foo":""}}"#), + )?; + + // Arrays should merge their element types recursively. + test( + r#"["foo"]"#, + &mut JsonType::new_json2(JsonNativeType::Array(Box::new(JsonNativeType::u64()))), + Ok(r#"[""]"#), + )?; + + // Root-level incompatible types should be lifted to Variant. + test( + r#"{"foo":"bar"}"#, + &mut JsonType::new_json2(JsonNativeType::Bool), + Ok(r#""""#), + )?; + + // Jsonb and Json2 should not be mergeable. + test( + "true", + &mut JsonType::new(JsonFormat::Jsonb), + Err("Failed to merge JSON datatype: json format not match"), + )?; Ok(()) } diff --git a/src/datatypes/src/vectors/helper.rs b/src/datatypes/src/vectors/helper.rs index 4907bbf380..21bb371617 100644 --- a/src/datatypes/src/vectors/helper.rs +++ b/src/datatypes/src/vectors/helper.rs @@ -462,14 +462,6 @@ impl Helper { } } -#[cfg(test)] -pub(crate) fn pretty_print(vector: VectorRef) -> String { - let array = vector.to_arrow_array(); - arrow::util::pretty::pretty_format_columns(&vector.vector_type_name(), &[array]) - .map(|x| x.to_string()) - .unwrap_or_else(|e| e.to_string()) -} - #[cfg(test)] mod tests { use arrow::array::{ diff --git a/src/datatypes/src/vectors/json/builder.rs b/src/datatypes/src/vectors/json/builder.rs index 58b4073666..3e76f544d7 100644 --- a/src/datatypes/src/vectors/json/builder.rs +++ b/src/datatypes/src/vectors/json/builder.rs @@ -184,7 +184,7 @@ pub(crate) struct JsonVectorBuilder { impl JsonVectorBuilder { pub(crate) fn new(json_type: JsonNativeType, capacity: usize) -> Self { Self { - merged_type: JsonType::new_native(json_type), + merged_type: JsonType::new_json2(json_type), capacity, builders: vec![], } @@ -293,192 +293,3 @@ impl MutableVector for JsonVectorBuilder { .fail() } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::data_type::DataType; - use crate::json::JsonStructureSettings; - use crate::vectors::helper::pretty_print; - - fn push(json: &str, builder: &mut JsonVectorBuilder, expected: std::result::Result<(), &str>) { - let settings = JsonStructureSettings::Structured(None); - let json: serde_json::Value = serde_json::from_str(json).unwrap(); - let value = settings.encode(json).unwrap(); - - let value = value.as_value_ref(); - let result = builder - .try_push_value_ref(&value) - .map_err(|e| e.to_string()); - let expected = expected.map_err(|e| e.to_string()); - assert_eq!(result, expected); - } - - #[test] - fn test_push_plain_jsons() -> Result<()> { - let jsons = vec!["1", "2", r#""s""#, "[true]"]; - let results = vec![ - Ok(()), - Ok(()), - Err( - r#"Failed to merge JSON datatype: datatypes have conflict, this: "", that: """#, - ), - Err( - r#"Failed to merge JSON datatype: datatypes have conflict, this: "", that: [""]"#, - ), - ]; - let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1); - for (json, result) in jsons.into_iter().zip(results) { - push(json, &mut builder, result); - } - let vector = builder.to_vector(); - let expected = r#" -+---------------------+ -| StructVector | -+---------------------+ -| {__json_plain__: 1} | -| {__json_plain__: 2} | -+---------------------+"#; - assert_eq!(pretty_print(vector), expected.trim()); - Ok(()) - } - - #[test] - fn test_push_json_objects() -> Result<()> { - let jsons = vec![ - r#"{ - "s": "a", - "list": [1, 2, 3] - }"#, - r#"{ - "list": [4], - "s": "b" - }"#, - r#"{ - "s": "c", - "float": 0.9 - }"#, - r#"{ - "float": 0.8, - "s": "d" - }"#, - r#"{ - "float": 0.7, - "int": -1 - }"#, - r#"{ - "int": 0, - "float": 0.6 - }"#, - r#"{ - "int": 1, - "object": {"hello": "world", "timestamp": 1761523200000} - }"#, - r#"{ - "object": {"hello": "greptime", "timestamp": 1761523201000}, - "int": 2 - }"#, - r#"{ - "object": {"timestamp": 1761523202000}, - "nested": {"a": {"b": {"b": {"a": "abba"}}}} - }"#, - r#"{ - "nested": {"a": {"b": {"a": {"b": "abab"}}}}, - "object": {"timestamp": 1761523203000} - }"#, - ]; - let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1); - for json in jsons { - push(json, &mut builder, Ok(())); - } - assert_eq!(builder.len(), 10); - - // test children builders: - assert_eq!(builder.builders.len(), 6); - let expect_types = [ - r#"Json<{"list":[""],"s":""}>"#, - r#"Json<{"float":"","s":""}>"#, - r#"Json<{"float":"","int":""}>"#, - r#"Json<{"int":"","object":{"hello":"","timestamp":""}}>"#, - r#"Json<{"nested":{"a":{"b":{"b":{"a":""}}}},"object":{"timestamp":""}}>"#, - r#"Json<{"nested":{"a":{"b":{"a":{"b":""}}}},"object":{"timestamp":""}}>"#, - ]; - let expect_vectors = [ - r#" -+-------------------------+ -| StructVector | -+-------------------------+ -| {list: [1, 2, 3], s: a} | -| {list: [4], s: b} | -+-------------------------+"#, - r#" -+--------------------+ -| StructVector | -+--------------------+ -| {float: 0.9, s: c} | -| {float: 0.8, s: d} | -+--------------------+"#, - r#" -+-----------------------+ -| StructVector | -+-----------------------+ -| {float: 0.7, int: -1} | -| {float: 0.6, int: 0} | -+-----------------------+"#, - r#" -+---------------------------------------------------------------+ -| StructVector | -+---------------------------------------------------------------+ -| {int: 1, object: {hello: world, timestamp: 1761523200000}} | -| {int: 2, object: {hello: greptime, timestamp: 1761523201000}} | -+---------------------------------------------------------------+"#, - r#" -+------------------------------------------------------------------------+ -| StructVector | -+------------------------------------------------------------------------+ -| {nested: {a: {b: {b: {a: abba}}}}, object: {timestamp: 1761523202000}} | -+------------------------------------------------------------------------+"#, - r#" -+------------------------------------------------------------------------+ -| StructVector | -+------------------------------------------------------------------------+ -| {nested: {a: {b: {a: {b: abab}}}}, object: {timestamp: 1761523203000}} | -+------------------------------------------------------------------------+"#, - ]; - for (builder, (expect_type, expect_vector)) in builder - .builders - .iter() - .zip(expect_types.into_iter().zip(expect_vectors)) - { - assert_eq!(builder.json_type.name(), expect_type); - let vector = builder.inner.to_vector_cloned(); - assert_eq!(pretty_print(vector), expect_vector.trim()); - } - - // test final merged json type: - let expected = r#"Json<{"float":"","int":"","list":[""],"nested":{"a":{"b":{"a":{"b":""},"b":{"a":""}}}},"object":{"hello":"","timestamp":""},"s":""}>"#; - assert_eq!(builder.data_type().to_string(), expected); - - // test final produced vector: - let expected = r#" -+-------------------------------------------------------------------------------------------------------------------+ -| StructVector | -+-------------------------------------------------------------------------------------------------------------------+ -| {float: , int: , list: [1, 2, 3], nested: , object: , s: a} | -| {float: , int: , list: [4], nested: , object: , s: b} | -| {float: 0.9, int: , list: , nested: , object: , s: c} | -| {float: 0.8, int: , list: , nested: , object: , s: d} | -| {float: 0.7, int: -1, list: , nested: , object: , s: } | -| {float: 0.6, int: 0, list: , nested: , object: , s: } | -| {float: , int: 1, list: , nested: , object: {hello: world, timestamp: 1761523200000}, s: } | -| {float: , int: 2, list: , nested: , object: {hello: greptime, timestamp: 1761523201000}, s: } | -| {float: , int: , list: , nested: {a: {b: {a: , b: {a: abba}}}}, object: {hello: , timestamp: 1761523202000}, s: } | -| {float: , int: , list: , nested: {a: {b: {a: {b: abab}, b: }}}, object: {hello: , timestamp: 1761523203000}, s: } | -+-------------------------------------------------------------------------------------------------------------------+"#; - let vector = builder.to_vector_cloned(); - assert_eq!(pretty_print(vector), expected.trim()); - let vector = builder.to_vector(); - assert_eq!(pretty_print(vector), expected.trim()); - Ok(()) - } -} diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 92b9dee4b9..2f5bdad50a 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -689,17 +689,16 @@ fn resolve_value( } VrlValue::Array(_) | VrlValue::Object(_) => { - let is_json_native_type = schema_info + let is_json2 = schema_info .find_column_schema_in_table(&column_name) .is_some_and(|x| { - if let ConcreteDataType::Json(column_type) = &x.column_schema.data_type { - column_type.is_native_type() - } else { - false - } + matches!( + &x.column_schema.data_type, + ConcreteDataType::Json(column_type) if column_type.is_json2() + ) }); - let value = if is_json_native_type { + let value = if is_json2 { let json_extension_type: Option = if let Some(x) = schema_info.find_column_schema_in_table(&column_name) { x.column_schema.extension_type()? diff --git a/src/sql/src/statements.rs b/src/sql/src/statements.rs index 211fc5598e..25680a09df 100644 --- a/src/sql/src/statements.rs +++ b/src/sql/src/statements.rs @@ -63,6 +63,7 @@ pub use crate::statements::option_map::OptionMap; pub(crate) use crate::statements::transform::transform_statements; const VECTOR_TYPE_NAME: &str = "VECTOR"; +const JSON2_TYPE_NAME: &str = "JSON2"; pub fn value_to_sql_value(val: &Value) -> Result { Ok(match val { @@ -153,7 +154,16 @@ pub fn column_to_schema( column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some()); - if matches!(column.data_type(), SqlDataType::JSON) { + let is_json2_column = if let SqlDataType::Custom(object_name, _) = column.data_type() { + object_name + .0 + .first() + .map(|x| x.to_string_unquoted().eq_ignore_ascii_case(JSON2_TYPE_NAME)) + .unwrap_or_default() + } else { + false + }; + if is_json2_column || matches!(column.data_type(), SqlDataType::JSON) { let settings = column .extensions .build_json_structure_settings()? @@ -273,39 +283,42 @@ pub fn sql_data_type_to_concrete_data_type( Ok(ConcreteDataType::decimal128_datatype(*p as u8, *s as i8)) } }, - SqlDataType::JSON => { - let format = if let Some(x) = column_extensions.build_json_structure_settings()? { - if let Some(fields) = match x { - JsonStructureSettings::Structured(fields) => fields, - JsonStructureSettings::UnstructuredRaw => None, - JsonStructureSettings::PartialUnstructuredByKey { fields, .. } => fields, - } { - let datatype = &ConcreteDataType::Struct(fields); - JsonFormat::Native(Box::new(datatype.into())) - } else { - JsonFormat::Native(Box::new(JsonNativeType::Null)) + SqlDataType::JSON => Ok(ConcreteDataType::Json(JsonType::new(JsonFormat::Jsonb))), + // Vector type and JSON2 type + SqlDataType::Custom(name, args) if name.0.len() == 1 => { + let name = name.0[0].to_string_unquoted().to_ascii_uppercase(); + match name.as_str() { + VECTOR_TYPE_NAME if args.len() == 1 => { + let dim = &args[0]; + let dim = dim.parse().map_err(|e| { + error::ParseSqlValueSnafu { + msg: format!("Failed to parse vector dimension '{}': {}", dim, e), + } + .build() + })?; + Ok(ConcreteDataType::vector_datatype(dim)) } - } else { - JsonFormat::Jsonb - }; - Ok(ConcreteDataType::Json(JsonType::new(format))) - } - // Vector type - SqlDataType::Custom(name, d) - if name.0.as_slice().len() == 1 - && name.0.as_slice()[0] - .to_string_unquoted() - .to_ascii_uppercase() - == VECTOR_TYPE_NAME - && d.len() == 1 => - { - let dim = d[0].parse().map_err(|e| { - error::ParseSqlValueSnafu { - msg: format!("Failed to parse vector dimension: {}", e), + JSON2_TYPE_NAME if args.is_empty() => { + let native_type = column_extensions + .build_json_structure_settings()? + .and_then(|x| match x { + JsonStructureSettings::Structured(Some(fields)) + | JsonStructureSettings::PartialUnstructuredByKey { + fields: Some(fields), + .. + } => Some(JsonNativeType::from(&ConcreteDataType::Struct(fields))), + JsonStructureSettings::UnstructuredRaw => Some(JsonNativeType::Variant), + _ => None, + }) + .unwrap_or(JsonNativeType::Null); + let format = JsonFormat::Json2(Box::new(native_type)); + Ok(ConcreteDataType::Json(JsonType::new(format))) } - .build() - })?; - Ok(ConcreteDataType::vector_datatype(dim)) + _ => error::SqlTypeNotSupportedSnafu { + t: data_type.clone(), + } + .fail(), + } } _ => error::SqlTypeNotSupportedSnafu { t: data_type.clone(), diff --git a/src/sql/src/statements/create.rs b/src/sql/src/statements/create.rs index 80eb52c406..f54fee1844 100644 --- a/src/sql/src/statements/create.rs +++ b/src/sql/src/statements/create.rs @@ -377,32 +377,35 @@ impl ColumnExtensions { None }; - options + let format = options .get(JSON_OPT_FORMAT) - .map(|format| match format { - JSON_FORMAT_FULL_STRUCTURED => Ok(JsonStructureSettings::Structured(fields)), - JSON_FORMAT_PARTIAL => { - let fields = fields.map(|fields| { - let mut fields = Arc::unwrap_or_clone(fields.fields()); - fields.push(datatypes::types::StructField::new( - JsonStructureSettings::RAW_FIELD.to_string(), - ConcreteDataType::string_datatype(), - true, - )); - StructType::new(Arc::new(fields)) - }); - Ok(JsonStructureSettings::PartialUnstructuredByKey { - fields, - unstructured_keys, - }) + .unwrap_or(JSON_FORMAT_FULL_STRUCTURED); + let settings = match format { + JSON_FORMAT_FULL_STRUCTURED => JsonStructureSettings::Structured(fields), + JSON_FORMAT_PARTIAL => { + let fields = fields.map(|fields| { + let mut fields = Arc::unwrap_or_clone(fields.fields()); + fields.push(datatypes::types::StructField::new( + JsonStructureSettings::RAW_FIELD.to_string(), + ConcreteDataType::string_datatype(), + true, + )); + StructType::new(Arc::new(fields)) + }); + JsonStructureSettings::PartialUnstructuredByKey { + fields, + unstructured_keys, } - JSON_FORMAT_RAW => Ok(JsonStructureSettings::UnstructuredRaw), - _ => InvalidSqlSnafu { + } + JSON_FORMAT_RAW => JsonStructureSettings::UnstructuredRaw, + _ => { + return InvalidSqlSnafu { msg: format!("unknown JSON datatype 'format': {format}"), } - .fail(), - }) - .transpose() + .fail(); + } + }; + Ok(Some(settings)) } pub fn set_json_structure_settings(&mut self, settings: JsonStructureSettings) { diff --git a/tests-integration/tests/jsonbench.rs b/tests-integration/tests/jsonbench.rs index 60f699c4ce..55cfcd53f0 100644 --- a/tests-integration/tests/jsonbench.rs +++ b/tests-integration/tests/jsonbench.rs @@ -25,6 +25,8 @@ use servers::server::ServerHandlers; use tests_integration::standalone::GreptimeDbStandaloneBuilder; use tests_integration::test_util::execute_sql_and_expect; +// TODO(LFC): Unignore the test when JSON2 is ready. +#[ignore] #[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn test_load_jsonbench_data_by_pipeline() -> io::Result<()> { common_telemetry::init_default_ut_logging(); @@ -121,6 +123,8 @@ transform: assert!(response.starts_with(pattern)); } +// TODO(LFC): Unignore the test when JSON2 is ready. +#[ignore] #[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn test_load_jsonbench_data_by_sql() -> io::Result<()> { common_telemetry::init_default_ut_logging(); diff --git a/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.result b/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.result index 0bd2acfa4f..f376dfe59c 100644 --- a/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.result +++ b/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.result @@ -125,29 +125,26 @@ Affected Rows: 0 CREATE TABLE promql_instant_mismatch_nested ( ts TIMESTAMP(3) TIME INDEX, k STRING PRIMARY KEY, - v JSON(format = "structured"), + v JSON2, ); Affected Rows: 0 -INSERT INTO promql_instant_mismatch_nested VALUES - (0, 'a', '{"x": 1}'), - (1000, 'a', '{"x": 2}'); - -Affected Rows: 2 - +-- TODO(LFC): Uncomment the following SQLs and results when JSON2 is ready. +-- INSERT INTO promql_instant_mismatch_nested VALUES +-- (0, 'a', '{"x": 1}'), +-- (1000, 'a', '{"x": 2}'); +-- Affected Rows: 0 -- This used to error due to InstantManipulateExec returning `column_statistics` sized by -- `schema.flattened_fields().len()` when the schema contains nested fields (Arrow Struct). -- SQLNESS SORT_RESULT 3 1 -TQL EVAL (0, 1, '1s') promql_instant_mismatch_nested == promql_instant_mismatch_nested; - -+---------------------+--------+---+ -| ts | v | k | -+---------------------+--------+---+ -| 1970-01-01T00:00:00 | {x: 1} | a | -| 1970-01-01T00:00:01 | {x: 2} | a | -+---------------------+--------+---+ - +-- TQL EVAL (0, 1, '1s') promql_instant_mismatch_nested == promql_instant_mismatch_nested; +-- +---------------------+--------+---+ +-- | ts | v | k | +-- +---------------------+--------+---+ +-- | 1970-01-01T00:00:00 | {x: 1} | a | +-- | 1970-01-01T00:00:01 | {x: 2} | a | +-- +---------------------+--------+---+ DROP TABLE promql_instant_mismatch_nested; Affected Rows: 0 diff --git a/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.sql b/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.sql index 372d0387e0..db69796e74 100644 --- a/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.sql +++ b/tests/cases/standalone/common/promql/stats_schema_mismatch_regression.sql @@ -107,16 +107,27 @@ DROP TABLE promql_stats_mismatch_physical; CREATE TABLE promql_instant_mismatch_nested ( ts TIMESTAMP(3) TIME INDEX, k STRING PRIMARY KEY, - v JSON(format = "structured"), + v JSON2, ); -INSERT INTO promql_instant_mismatch_nested VALUES - (0, 'a', '{"x": 1}'), - (1000, 'a', '{"x": 2}'); +-- TODO(LFC): Uncomment the following SQLs and results when JSON2 is ready. + +-- INSERT INTO promql_instant_mismatch_nested VALUES +-- (0, 'a', '{"x": 1}'), +-- (1000, 'a', '{"x": 2}'); + +-- Affected Rows: 0 -- This used to error due to InstantManipulateExec returning `column_statistics` sized by -- `schema.flattened_fields().len()` when the schema contains nested fields (Arrow Struct). -- SQLNESS SORT_RESULT 3 1 -TQL EVAL (0, 1, '1s') promql_instant_mismatch_nested == promql_instant_mismatch_nested; +-- TQL EVAL (0, 1, '1s') promql_instant_mismatch_nested == promql_instant_mismatch_nested; + +-- +---------------------+--------+---+ +-- | ts | v | k | +-- +---------------------+--------+---+ +-- | 1970-01-01T00:00:00 | {x: 1} | a | +-- | 1970-01-01T00:00:01 | {x: 2} | a | +-- +---------------------+--------+---+ DROP TABLE promql_instant_mismatch_nested; diff --git a/tests/cases/standalone/common/types/json/json-structured.result b/tests/cases/standalone/common/types/json/json-structured.result deleted file mode 100644 index be04e2652d..0000000000 --- a/tests/cases/standalone/common/types/json/json-structured.result +++ /dev/null @@ -1,82 +0,0 @@ -CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured") DEFAULT '{"foo": "bar"}'); - -Error: 1001(Unsupported), Unsupported default constraint for column: 'j', reason: json column cannot have a default value - -CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured")); - -Affected Rows: 0 - -DESC TABLE t; - -+--------+----------------------+-----+------+---------+---------------+ -| Column | Type | Key | Null | Default | Semantic Type | -+--------+----------------------+-----+------+---------+---------------+ -| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | -| j | Json<""> | | YES | | FIELD | -+--------+----------------------+-----+------+---------+---------------+ - -INSERT INTO t VALUES -(1762128001000, '{"int": 1}'), -(1762128002000, '{"int": 2, "list": [0.1, 0.2, 0.3]}'), -(1762128003000, '{"int": 3, "list": [0.4, 0.5, 0.6], "nested": {"a": {"x": "hello"}, "b": {"y": -1}}}'); - -Affected Rows: 3 - -DESC TABLE t; - -+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ -| Column | Type | Key | Null | Default | Semantic Type | -+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ -| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | -| j | Json<{"int":"","list":[""],"nested":{"a":{"x":""},"b":{"y":""}}}> | | YES | | FIELD | -+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ - -INSERT INTO t VALUES -(1762128004000, '{"int": 4, "bool": true, "nested": {"a": {"y": 1}}}'), -(1762128005000, '{"int": 5, "bool": false, "nested": {"b": {"x": "world"}}}'); - -Affected Rows: 2 - -DESC TABLE t; - -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ -| Column | Type | Key | Null | Default | Semantic Type | -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ -| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | -| j | Json<{"bool":"","int":"","list":[""],"nested":{"a":{"x":"","y":""},"b":{"x":"","y":""}}}> | | YES | | FIELD | -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ - -INSERT INTO t VALUES (1762128006000, '{"int": 6, "list": [-6.0], "bool": true, "nested": {"a": {"x": "ax", "y": 66}, "b": {"y": -66, "x": "bx"}}}'); - -Affected Rows: 1 - -DESC TABLE t; - -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ -| Column | Type | Key | Null | Default | Semantic Type | -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ -| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | -| j | Json<{"bool":"","int":"","list":[""],"nested":{"a":{"x":"","y":""},"b":{"x":"","y":""}}}> | | YES | | FIELD | -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ - -INSERT INTO t VALUES (1762128011000, '{}'); - -Error: 1004(InvalidArguments), Invalid InsertRequest, reason: empty json object is not supported, consider adding a dummy field - -SELECT ts, j FROM t order by ts; - -+---------------------+----------------------------------------------------------------------------------------+ -| ts | j | -+---------------------+----------------------------------------------------------------------------------------+ -| 2025-11-03T00:00:01 | {bool: , int: 1, list: , nested: } | -| 2025-11-03T00:00:02 | {bool: , int: 2, list: [0.1, 0.2, 0.3], nested: } | -| 2025-11-03T00:00:03 | {bool: , int: 3, list: [0.4, 0.5, 0.6], nested: {a: {x: hello, y: }, b: {x: , y: -1}}} | -| 2025-11-03T00:00:04 | {bool: true, int: 4, list: , nested: {a: {x: , y: 1}, b: }} | -| 2025-11-03T00:00:05 | {bool: false, int: 5, list: , nested: {a: , b: {x: world, y: }}} | -| 2025-11-03T00:00:06 | {bool: true, int: 6, list: [-6.0], nested: {a: {x: ax, y: 66}, b: {x: bx, y: -66}}} | -+---------------------+----------------------------------------------------------------------------------------+ - -DROP table t; - -Affected Rows: 0 - diff --git a/tests/cases/standalone/common/types/json/json-structured.sql b/tests/cases/standalone/common/types/json/json-structured.sql deleted file mode 100644 index 8bb10b4b0e..0000000000 --- a/tests/cases/standalone/common/types/json/json-structured.sql +++ /dev/null @@ -1,28 +0,0 @@ -CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured") DEFAULT '{"foo": "bar"}'); - -CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured")); - -DESC TABLE t; - -INSERT INTO t VALUES -(1762128001000, '{"int": 1}'), -(1762128002000, '{"int": 2, "list": [0.1, 0.2, 0.3]}'), -(1762128003000, '{"int": 3, "list": [0.4, 0.5, 0.6], "nested": {"a": {"x": "hello"}, "b": {"y": -1}}}'); - -DESC TABLE t; - -INSERT INTO t VALUES -(1762128004000, '{"int": 4, "bool": true, "nested": {"a": {"y": 1}}}'), -(1762128005000, '{"int": 5, "bool": false, "nested": {"b": {"x": "world"}}}'); - -DESC TABLE t; - -INSERT INTO t VALUES (1762128006000, '{"int": 6, "list": [-6.0], "bool": true, "nested": {"a": {"x": "ax", "y": 66}, "b": {"y": -66, "x": "bx"}}}'); - -DESC TABLE t; - -INSERT INTO t VALUES (1762128011000, '{}'); - -SELECT ts, j FROM t order by ts; - -DROP table t; diff --git a/tests/cases/standalone/common/types/json/json2.result b/tests/cases/standalone/common/types/json/json2.result new file mode 100644 index 0000000000..d5f54d79c8 --- /dev/null +++ b/tests/cases/standalone/common/types/json/json2.result @@ -0,0 +1,14 @@ +create table json2_table ( + ts timestamp time index, + j json2 +) with ( + 'append_mode' = 'true', + 'sst_format' = 'flat', +); + +Affected Rows: 0 + +drop table json2_table; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/json/json2.sql b/tests/cases/standalone/common/types/json/json2.sql new file mode 100644 index 0000000000..2e3a131385 --- /dev/null +++ b/tests/cases/standalone/common/types/json/json2.sql @@ -0,0 +1,9 @@ +create table json2_table ( + ts timestamp time index, + j json2 +) with ( + 'append_mode' = 'true', + 'sst_format' = 'flat', +); + +drop table json2_table;