From c02754b44c80b2ef59be24af06a1578ab144efce Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 21 Nov 2025 12:50:38 +0800 Subject: [PATCH] feat: udf `json_get_object` (#7241) Signed-off-by: luofucong --- src/common/function/src/scalars/json.rs | 3 +- .../function/src/scalars/json/json_get.rs | 131 +++++++++++++++++- .../src/scalars/json/json_to_string.rs | 13 +- .../common/function/json/json_get.result | 122 ++++++++++++++++ .../common/function/json/json_get.sql | 26 ++++ 5 files changed, 290 insertions(+), 5 deletions(-) diff --git a/src/common/function/src/scalars/json.rs b/src/common/function/src/scalars/json.rs index 9b022d71da..f84937fa0f 100644 --- a/src/common/function/src/scalars/json.rs +++ b/src/common/function/src/scalars/json.rs @@ -19,7 +19,7 @@ mod json_path_match; mod json_to_string; mod parse_json; -use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString}; +use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetObject, JsonGetString}; use json_is::{ JsonIsArray, JsonIsBool, JsonIsFloat, JsonIsInt, JsonIsNull, JsonIsObject, JsonIsString, }; @@ -39,6 +39,7 @@ impl JsonFunction { registry.register_scalar(JsonGetFloat::default()); registry.register_scalar(JsonGetString::default()); registry.register_scalar(JsonGetBool::default()); + registry.register_scalar(JsonGetObject::default()); registry.register_scalar(JsonIsNull::default()); registry.register_scalar(JsonIsInt::default()); diff --git a/src/common/function/src/scalars/json/json_get.rs b/src/common/function/src/scalars/json/json_get.rs index 51dd2fc9b7..92ea9cf990 100644 --- a/src/common/function/src/scalars/json/json_get.rs +++ b/src/common/function/src/scalars/json/json_get.rs @@ -16,10 +16,13 @@ use std::fmt::{self, Display}; use std::sync::Arc; use arrow::compute; +use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{ - Array, AsArray, BooleanBuilder, Float64Builder, Int64Builder, StringViewBuilder, + Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder, + StringViewBuilder, }; use datafusion_common::arrow::datatypes::DataType; +use datafusion_expr::type_coercion::aggregates::STRINGS; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature}; use crate::function::{Function, extract_args}; @@ -212,13 +215,92 @@ impl Display for JsonGetString { } } +/// Get the object from JSON value by path. +pub(super) struct JsonGetObject { + signature: Signature, +} + +impl JsonGetObject { + const NAME: &'static str = "json_get_object"; +} + +impl Default for JsonGetObject { + fn default() -> Self { + Self { + signature: helper::one_of_sigs2( + vec![ + DataType::Binary, + DataType::LargeBinary, + DataType::BinaryView, + ], + STRINGS.to_vec(), + ), + } + } +} + +impl Function for JsonGetObject { + fn name(&self) -> &str { + Self::NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::BinaryView) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let [arg0, arg1] = extract_args(self.name(), &args)?; + let arg0 = compute::cast(&arg0, &DataType::BinaryView)?; + let jsons = arg0.as_binary_view(); + let arg1 = compute::cast(&arg1, &DataType::Utf8View)?; + let paths = arg1.as_string_view(); + + let len = jsons.len(); + let mut builder = BinaryViewBuilder::with_capacity(len); + + for i in 0..len { + let json = jsons.is_valid(i).then(|| jsons.value(i)); + let path = paths.is_valid(i).then(|| paths.value(i)); + let result = if let (Some(json), Some(path)) = (json, path) { + let result = jsonb::jsonpath::parse_json_path(path.as_bytes()).and_then(|path| { + let mut data = Vec::new(); + let mut offset = Vec::new(); + jsonb::get_by_path(json, path, &mut data, &mut offset) + .map(|()| jsonb::is_object(&data).then_some(data)) + }); + result.map_err(|e| DataFusionError::Execution(e.to_string()))? + } else { + None + }; + builder.append_option(result); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +impl Display for JsonGetObject { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", Self::NAME.to_ascii_uppercase()) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; use arrow_schema::Field; - use datafusion_common::arrow::array::{BinaryArray, StringArray}; + use datafusion_common::ScalarValue; + use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray}; use datafusion_common::arrow::datatypes::{Float64Type, Int64Type}; + use datatypes::types::parse_string_to_jsonb; use super::*; @@ -425,4 +507,49 @@ mod tests { assert_eq!(*gt, result); } } + + #[test] + fn test_json_get_object() -> datafusion_common::Result<()> { + let udf = JsonGetObject::default(); + assert_eq!("json_get_object", udf.name()); + assert_eq!( + DataType::BinaryView, + udf.return_type(&[DataType::BinaryView, DataType::Utf8View])? + ); + + let json_value = parse_string_to_jsonb(r#"{"a": {"b": {"c": {"d": 1}}}}"#).unwrap(); + let paths = vec!["$", "$.a", "$.a.b", "$.a.b.c", "$.a.b.c.d", "$.e", "$.a.e"]; + let number_rows = paths.len(); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Binary(Some(json_value))), + ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))), + ], + arg_fields: vec![], + number_rows, + return_field: Arc::new(Field::new("x", DataType::Binary, false)), + config_options: Arc::new(Default::default()), + }; + let result = udf + .invoke_with_args(args) + .and_then(|x| x.to_array(number_rows))?; + let result = result.as_binary_view(); + + let expected = &BinaryViewArray::from_iter( + vec![ + Some(r#"{"a": {"b": {"c": {"d": 1}}}}"#), + Some(r#"{"b": {"c": {"d": 1}}}"#), + Some(r#"{"c": {"d": 1}}"#), + Some(r#"{"d": 1}"#), + None, + None, + None, + ] + .into_iter() + .map(|x| x.and_then(|s| parse_string_to_jsonb(s).ok())), + ); + assert_eq!(result, expected); + Ok(()) + } } diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs index ae134b75dd..6c0cc260b2 100644 --- a/src/common/function/src/scalars/json/json_to_string.rs +++ b/src/common/function/src/scalars/json/json_to_string.rs @@ -32,7 +32,15 @@ impl Default for JsonToStringFunction { fn default() -> Self { Self { // TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type. - signature: Signature::exact(vec![DataType::Binary], Volatility::Immutable), + signature: Signature::uniform( + 1, + vec![ + DataType::Binary, + DataType::LargeBinary, + DataType::BinaryView, + ], + Volatility::Immutable, + ), } } } @@ -57,7 +65,8 @@ impl Function for JsonToStringFunction { args: ScalarFunctionArgs, ) -> datafusion_common::Result { let [arg0] = extract_args(self.name(), &args)?; - let jsons = arg0.as_binary::(); + let arg0 = arrow::compute::cast(&arg0, &DataType::BinaryView)?; + let jsons = arg0.as_binary_view(); let size = jsons.len(); let mut builder = StringViewBuilder::with_capacity(size); diff --git a/tests/cases/standalone/common/function/json/json_get.result b/tests/cases/standalone/common/function/json/json_get.result index 01767387a9..5f17415d0c 100644 --- a/tests/cases/standalone/common/function/json/json_get.result +++ b/tests/cases/standalone/common/function/json/json_get.result @@ -47,6 +47,30 @@ SELECT json_get_string(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); | | +--------------------------------------------------------------------------------+ +SELECT json_to_string(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c')); + ++---------------------------------------------------------------------------------------------------+ +| json_to_string(json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.b.c"))) | ++---------------------------------------------------------------------------------------------------+ +| {"d":42} | ++---------------------------------------------------------------------------------------------------+ + +SELECT json_get_int(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'), 'd'); + ++-----------------------------------------------------------------------------------------------------------+ +| json_get_int(json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.b.c")),Utf8("d")) | ++-----------------------------------------------------------------------------------------------------------+ +| 42 | ++-----------------------------------------------------------------------------------------------------------+ + +SELECT json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.e'); + ++---------------------------------------------------------------------------------+ +| json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.e")) | ++---------------------------------------------------------------------------------+ +| | ++---------------------------------------------------------------------------------+ + -- test functions with table rows -- CREATE TABLE jsons(j JSON, ts timestamp time index); @@ -123,6 +147,39 @@ SELECT json_get_int(j, 'a.b["c"]') FROM jsons; | 1 | +----------------------------------------+ +SELECT json_to_string(json_get_object(j, 'a.b')) FROM jsons; + ++------------------------------------------------------+ +| json_to_string(json_get_object(jsons.j,Utf8("a.b"))) | ++------------------------------------------------------+ +| {"c":1} | +| {"c":1.234} | +| {"c":"foo"} | +| {"c":true} | ++------------------------------------------------------+ + +SELECT json_get_string(json_get_object(j, 'a.b'), 'c') FROM jsons; + ++-----------------------------------------------------------------+ +| json_get_string(json_get_object(jsons.j,Utf8("a.b")),Utf8("c")) | ++-----------------------------------------------------------------+ +| 1 | +| 1.234 | +| foo | +| true | ++-----------------------------------------------------------------+ + +SELECT json_get_object(j, 'a.x') FROM jsons; + ++--------------------------------------+ +| json_get_object(jsons.j,Utf8("a.x")) | ++--------------------------------------+ +| | +| | +| | +| | ++--------------------------------------+ + DROP TABLE jsons; Affected Rows: 0 @@ -148,6 +205,10 @@ INSERT INTO jsons VALUES(parse_json('[1.2, 3.1415926535897932384626, -3e123, 1e1 Affected Rows: 1 +INSERT INTO jsons VALUES(parse_json('[{"a": {"i": 1}}, {"a": {"i": 2}}, {"a": {"i": 3}}]'), 5); + +Affected Rows: 1 + SELECT json_get_int(j, '[0]') FROM jsons; +-----------------------------------+ @@ -157,6 +218,7 @@ SELECT json_get_int(j, '[0]') FROM jsons; | 1 | | 1 | | | +| | +-----------------------------------+ SELECT json_get_float(j, '[1]') FROM jsons; @@ -168,6 +230,7 @@ SELECT json_get_float(j, '[1]') FROM jsons; | 0.0 | | 0.0 | | 3.141592653589793 | +| | +-------------------------------------+ SELECT json_get_bool(j, '[2]') FROM jsons; @@ -179,6 +242,7 @@ SELECT json_get_bool(j, '[2]') FROM jsons; | false | | | | | +| | +------------------------------------+ SELECT json_get_string(j, '[3]') FROM jsons; @@ -190,8 +254,45 @@ SELECT json_get_string(j, '[3]') FROM jsons; | false | | 2147483648 | | 1e100 | +| | +--------------------------------------------------------+ +SELECT json_to_string(json_get_object(j, '[0]')) FROM jsons; + ++------------------------------------------------------+ +| json_to_string(json_get_object(jsons.j,Utf8("[0]"))) | ++------------------------------------------------------+ +| | +| | +| | +| | +| {"a":{"i":1}} | ++------------------------------------------------------+ + +SELECT json_get_int(json_get_object(j, '[0]'), 'a.i') FROM jsons; + ++----------------------------------------------------------------+ +| json_get_int(json_get_object(jsons.j,Utf8("[0]")),Utf8("a.i")) | ++----------------------------------------------------------------+ +| | +| | +| | +| | +| 1 | ++----------------------------------------------------------------+ + +SELECT json_get_int(json_get_object(j, '[9]'), 'a.i') FROM jsons; + ++----------------------------------------------------------------+ +| json_get_int(json_get_object(jsons.j,Utf8("[9]")),Utf8("a.i")) | ++----------------------------------------------------------------+ +| | +| | +| | +| | +| | ++----------------------------------------------------------------+ + DROP TABLE jsons; Affected Rows: 0 @@ -259,6 +360,27 @@ SELECT json_to_string(j) FROM jsons WHERE CAST(json_get_int(j, 'a.b.c') AS BOOLE | {"a":{"b":{"c":true}}} | +-------------------------+ +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.b'), 'c') == 'foo'; + ++-------------------------+ +| json_to_string(jsons.j) | ++-------------------------+ +| {"a":{"b":{"c":"foo"}}} | ++-------------------------+ + +SELECT json_to_string(j) FROM jsons WHERE json_to_string(json_get_object(j, 'a.b')) == '{"c":1}'; + ++-------------------------+ +| json_to_string(jsons.j) | ++-------------------------+ +| {"a":{"b":{"c":1}}} | ++-------------------------+ + +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.x'), 'c') == 'foo'; + +++ +++ + DROP TABLE jsons; Affected Rows: 0 diff --git a/tests/cases/standalone/common/function/json/json_get.sql b/tests/cases/standalone/common/function/json/json_get.sql index 3247536b07..010a3bd7a7 100644 --- a/tests/cases/standalone/common/function/json/json_get.sql +++ b/tests/cases/standalone/common/function/json/json_get.sql @@ -11,6 +11,12 @@ SELECT json_get_int(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); SELECT json_get_string(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); +SELECT json_to_string(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c')); + +SELECT json_get_int(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'), 'd'); + +SELECT json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.e'); + -- test functions with table rows -- CREATE TABLE jsons(j JSON, ts timestamp time index); @@ -32,6 +38,12 @@ SELECT json_get_bool(j, 'a.b.c') FROM jsons; SELECT json_get_int(j, 'a.b["c"]') FROM jsons; +SELECT json_to_string(json_get_object(j, 'a.b')) FROM jsons; + +SELECT json_get_string(json_get_object(j, 'a.b'), 'c') FROM jsons; + +SELECT json_get_object(j, 'a.x') FROM jsons; + DROP TABLE jsons; -- test functions with arrays -- @@ -45,6 +57,8 @@ INSERT INTO jsons VALUES(parse_json('[1, 0, -2147483649, 2147483648]'), 3); INSERT INTO jsons VALUES(parse_json('[1.2, 3.1415926535897932384626, -3e123, 1e100]'), 4); +INSERT INTO jsons VALUES(parse_json('[{"a": {"i": 1}}, {"a": {"i": 2}}, {"a": {"i": 3}}]'), 5); + SELECT json_get_int(j, '[0]') FROM jsons; SELECT json_get_float(j, '[1]') FROM jsons; @@ -53,6 +67,12 @@ SELECT json_get_bool(j, '[2]') FROM jsons; SELECT json_get_string(j, '[3]') FROM jsons; +SELECT json_to_string(json_get_object(j, '[0]')) FROM jsons; + +SELECT json_get_int(json_get_object(j, '[0]'), 'a.i') FROM jsons; + +SELECT json_get_int(json_get_object(j, '[9]'), 'a.i') FROM jsons; + DROP TABLE jsons; -- test functions in WHERE clause -- @@ -76,4 +96,10 @@ SELECT json_to_string(j) FROM jsons WHERE json_get_bool(j, 'a.b.c') = true; SELECT json_to_string(j) FROM jsons WHERE CAST(json_get_int(j, 'a.b.c') AS BOOLEAN); +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.b'), 'c') == 'foo'; + +SELECT json_to_string(j) FROM jsons WHERE json_to_string(json_get_object(j, 'a.b')) == '{"c":1}'; + +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.x'), 'c') == 'foo'; + DROP TABLE jsons;