feat: udf json_get_object (#7241)

Signed-off-by: luofucong <luofc@foxmail.com>
This commit is contained in:
LFC
2025-11-21 12:50:38 +08:00
committed by GitHub
parent 0b4f00feef
commit c02754b44c
5 changed files with 290 additions and 5 deletions

View File

@@ -19,7 +19,7 @@ mod json_path_match;
mod json_to_string;
mod parse_json;
use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString};
use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetObject, JsonGetString};
use json_is::{
JsonIsArray, JsonIsBool, JsonIsFloat, JsonIsInt, JsonIsNull, JsonIsObject, JsonIsString,
};
@@ -39,6 +39,7 @@ impl JsonFunction {
registry.register_scalar(JsonGetFloat::default());
registry.register_scalar(JsonGetString::default());
registry.register_scalar(JsonGetBool::default());
registry.register_scalar(JsonGetObject::default());
registry.register_scalar(JsonIsNull::default());
registry.register_scalar(JsonIsInt::default());

View File

@@ -16,10 +16,13 @@ use std::fmt::{self, Display};
use std::sync::Arc;
use arrow::compute;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{
Array, AsArray, BooleanBuilder, Float64Builder, Int64Builder, StringViewBuilder,
Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder,
StringViewBuilder,
};
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::type_coercion::aggregates::STRINGS;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature};
use crate::function::{Function, extract_args};
@@ -212,13 +215,92 @@ impl Display for JsonGetString {
}
}
/// Scalar function `json_get_object`: looks up a JSON path inside a binary JSON value
/// and returns the matched value only when it is a JSON object.
///
/// The first argument is the JSON value (`Binary`, `LargeBinary` or `BinaryView`), the
/// second is the path string. The result type is `BinaryView`; a row is null when either
/// input is null or when the value found at the path is not an object.
pub(super) struct JsonGetObject {
/// Accepted argument type combinations, built in the `Default` impl.
signature: Signature,
}
impl JsonGetObject {
/// Function name returned by `Function::name`; the `Display` impl prints it upper-cased.
const NAME: &'static str = "json_get_object";
}
impl Default for JsonGetObject {
    /// Builds the function with a two-argument signature: a binary JSON value followed by
    /// a string path.
    fn default() -> Self {
        // Binary encodings accepted for the JSON argument.
        let json_types = vec![
            DataType::Binary,
            DataType::LargeBinary,
            DataType::BinaryView,
        ];
        // String types accepted for the path argument.
        let path_types = STRINGS.to_vec();

        let signature = helper::one_of_sigs2(json_types, path_types);
        Self { signature }
    }
}
impl Function for JsonGetObject {
    /// Returns the function name, `json_get_object`.
    fn name(&self) -> &str {
        Self::NAME
    }

    /// The result is always `BinaryView`, whatever the argument types are.
    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
        Ok(DataType::BinaryView)
    }

    /// Returns the accepted argument signature.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Evaluates `json_get_object(json, path)` row by row.
    ///
    /// For each row the path is parsed and looked up in the JSON value. The matched bytes
    /// are emitted only when `jsonb::is_object` accepts them; otherwise the row is null.
    /// A row is also null when either the JSON value or the path is null.
    ///
    /// # Errors
    ///
    /// Returns an error when the arguments cannot be extracted or cast, and a
    /// `DataFusionError::Execution` when a path fails to parse or the lookup fails.
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> datafusion_common::Result<ColumnarValue> {
        let [arg0, arg1] = extract_args(self.name(), &args)?;

        // Normalize both inputs to view arrays so a single code path handles every
        // accepted binary/string encoding.
        let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
        let jsons = arg0.as_binary_view();
        let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
        let paths = arg1.as_string_view();

        // NOTE(review): the loop is driven by `jsons.len()` and indexes `paths` with the
        // same index; this assumes `extract_args` yields arrays of equal length — confirm.
        let len = jsons.len();
        let mut builder = BinaryViewBuilder::with_capacity(len);

        // Scratch buffers shared by all rows: the builder copies the bytes it is given,
        // so there is no need to allocate fresh vectors for every row.
        let mut data = Vec::new();
        let mut offsets = Vec::new();

        for i in 0..len {
            if jsons.is_null(i) || paths.is_null(i) {
                builder.append_null();
                continue;
            }

            // Reset the buffers so each lookup starts from the same empty state as a
            // freshly created vector.
            data.clear();
            offsets.clear();

            let json_path = jsonb::jsonpath::parse_json_path(paths.value(i).as_bytes())
                .map_err(|e| DataFusionError::Execution(e.to_string()))?;
            jsonb::get_by_path(jsons.value(i), json_path, &mut data, &mut offsets)
                .map_err(|e| DataFusionError::Execution(e.to_string()))?;

            // Only objects are returned; scalars, arrays and misses become null.
            if jsonb::is_object(&data) {
                builder.append_value(&data);
            } else {
                builder.append_null();
            }
        }

        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
    }
}
impl Display for JsonGetObject {
    /// Writes the function name in upper case (`JSON_GET_OBJECT`).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper_name = Self::NAME.to_ascii_uppercase();
        f.write_str(&upper_name)
    }
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_schema::Field;
use datafusion_common::arrow::array::{BinaryArray, StringArray};
use datafusion_common::ScalarValue;
use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray};
use datafusion_common::arrow::datatypes::{Float64Type, Int64Type};
use datatypes::types::parse_string_to_jsonb;
use super::*;
@@ -425,4 +507,49 @@ mod tests {
assert_eq!(*gt, result);
}
}
/// Checks `json_get_object` against a nested document: paths that resolve to an object
/// return that object, while paths that resolve to a scalar or to nothing return null.
#[test]
fn test_json_get_object() -> datafusion_common::Result<()> {
    let udf = JsonGetObject::default();
    assert_eq!("json_get_object", udf.name());

    let return_type = udf.return_type(&[DataType::BinaryView, DataType::Utf8View])?;
    assert_eq!(DataType::BinaryView, return_type);

    let json_value = parse_string_to_jsonb(r#"{"a": {"b": {"c": {"d": 1}}}}"#).unwrap();
    let paths = vec!["$", "$.a", "$.a.b", "$.a.b.c", "$.a.b.c.d", "$.e", "$.a.e"];
    let number_rows = paths.len();

    let args = ScalarFunctionArgs {
        args: vec![
            ColumnarValue::Scalar(ScalarValue::Binary(Some(json_value))),
            ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))),
        ],
        arg_fields: vec![],
        number_rows,
        // Describe the output exactly as the function produces it: `BinaryView`, and
        // nullable because non-object and missing paths yield null rows.
        return_field: Arc::new(Field::new("x", return_type, true)),
        config_options: Arc::new(Default::default()),
    };

    let result = udf
        .invoke_with_args(args)
        .and_then(|x| x.to_array(number_rows))?;
    let result = result.as_binary_view();

    let expected = &BinaryViewArray::from_iter(
        vec![
            Some(r#"{"a": {"b": {"c": {"d": 1}}}}"#),
            Some(r#"{"b": {"c": {"d": 1}}}"#),
            Some(r#"{"c": {"d": 1}}"#),
            Some(r#"{"d": 1}"#),
            // `$.a.b.c.d` is a number, not an object.
            None,
            // Paths that do not exist.
            None,
            None,
        ]
        .into_iter()
        .map(|x| x.and_then(|s| parse_string_to_jsonb(s).ok())),
    );
    assert_eq!(result, expected);
    Ok(())
}
}

View File

@@ -32,7 +32,15 @@ impl Default for JsonToStringFunction {
fn default() -> Self {
Self {
// TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type.
signature: Signature::exact(vec![DataType::Binary], Volatility::Immutable),
signature: Signature::uniform(
1,
vec![
DataType::Binary,
DataType::LargeBinary,
DataType::BinaryView,
],
Volatility::Immutable,
),
}
}
}
@@ -57,7 +65,8 @@ impl Function for JsonToStringFunction {
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let [arg0] = extract_args(self.name(), &args)?;
let jsons = arg0.as_binary::<i32>();
let arg0 = arrow::compute::cast(&arg0, &DataType::BinaryView)?;
let jsons = arg0.as_binary_view();
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);

View File

@@ -47,6 +47,30 @@ SELECT json_get_string(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b');
| |
+--------------------------------------------------------------------------------+
SELECT json_to_string(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'));
+---------------------------------------------------------------------------------------------------+
| json_to_string(json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.b.c"))) |
+---------------------------------------------------------------------------------------------------+
| {"d":42} |
+---------------------------------------------------------------------------------------------------+
SELECT json_get_int(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'), 'd');
+-----------------------------------------------------------------------------------------------------------+
| json_get_int(json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.b.c")),Utf8("d")) |
+-----------------------------------------------------------------------------------------------------------+
| 42 |
+-----------------------------------------------------------------------------------------------------------+
SELECT json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.e');
+---------------------------------------------------------------------------------+
| json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.e")) |
+---------------------------------------------------------------------------------+
| |
+---------------------------------------------------------------------------------+
-- test functions with table rows --
CREATE TABLE jsons(j JSON, ts timestamp time index);
@@ -123,6 +147,39 @@ SELECT json_get_int(j, 'a.b["c"]') FROM jsons;
| 1 |
+----------------------------------------+
SELECT json_to_string(json_get_object(j, 'a.b')) FROM jsons;
+------------------------------------------------------+
| json_to_string(json_get_object(jsons.j,Utf8("a.b"))) |
+------------------------------------------------------+
| {"c":1} |
| {"c":1.234} |
| {"c":"foo"} |
| {"c":true} |
+------------------------------------------------------+
SELECT json_get_string(json_get_object(j, 'a.b'), 'c') FROM jsons;
+-----------------------------------------------------------------+
| json_get_string(json_get_object(jsons.j,Utf8("a.b")),Utf8("c")) |
+-----------------------------------------------------------------+
| 1 |
| 1.234 |
| foo |
| true |
+-----------------------------------------------------------------+
SELECT json_get_object(j, 'a.x') FROM jsons;
+--------------------------------------+
| json_get_object(jsons.j,Utf8("a.x")) |
+--------------------------------------+
| |
| |
| |
| |
+--------------------------------------+
DROP TABLE jsons;
Affected Rows: 0
@@ -148,6 +205,10 @@ INSERT INTO jsons VALUES(parse_json('[1.2, 3.1415926535897932384626, -3e123, 1e1
Affected Rows: 1
INSERT INTO jsons VALUES(parse_json('[{"a": {"i": 1}}, {"a": {"i": 2}}, {"a": {"i": 3}}]'), 5);
Affected Rows: 1
SELECT json_get_int(j, '[0]') FROM jsons;
+-----------------------------------+
@@ -157,6 +218,7 @@ SELECT json_get_int(j, '[0]') FROM jsons;
| 1 |
| 1 |
| |
| |
+-----------------------------------+
SELECT json_get_float(j, '[1]') FROM jsons;
@@ -168,6 +230,7 @@ SELECT json_get_float(j, '[1]') FROM jsons;
| 0.0 |
| 0.0 |
| 3.141592653589793 |
| |
+-------------------------------------+
SELECT json_get_bool(j, '[2]') FROM jsons;
@@ -179,6 +242,7 @@ SELECT json_get_bool(j, '[2]') FROM jsons;
| false |
| |
| |
| |
+------------------------------------+
SELECT json_get_string(j, '[3]') FROM jsons;
@@ -190,8 +254,45 @@ SELECT json_get_string(j, '[3]') FROM jsons;
| false |
| 2147483648 |
| 1e100 |
| |
+--------------------------------------------------------+
SELECT json_to_string(json_get_object(j, '[0]')) FROM jsons;
+------------------------------------------------------+
| json_to_string(json_get_object(jsons.j,Utf8("[0]"))) |
+------------------------------------------------------+
| |
| |
| |
| |
| {"a":{"i":1}} |
+------------------------------------------------------+
SELECT json_get_int(json_get_object(j, '[0]'), 'a.i') FROM jsons;
+----------------------------------------------------------------+
| json_get_int(json_get_object(jsons.j,Utf8("[0]")),Utf8("a.i")) |
+----------------------------------------------------------------+
| |
| |
| |
| |
| 1 |
+----------------------------------------------------------------+
SELECT json_get_int(json_get_object(j, '[9]'), 'a.i') FROM jsons;
+----------------------------------------------------------------+
| json_get_int(json_get_object(jsons.j,Utf8("[9]")),Utf8("a.i")) |
+----------------------------------------------------------------+
| |
| |
| |
| |
| |
+----------------------------------------------------------------+
DROP TABLE jsons;
Affected Rows: 0
@@ -259,6 +360,27 @@ SELECT json_to_string(j) FROM jsons WHERE CAST(json_get_int(j, 'a.b.c') AS BOOLE
| {"a":{"b":{"c":true}}} |
+-------------------------+
SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.b'), 'c') == 'foo';
+-------------------------+
| json_to_string(jsons.j) |
+-------------------------+
| {"a":{"b":{"c":"foo"}}} |
+-------------------------+
SELECT json_to_string(j) FROM jsons WHERE json_to_string(json_get_object(j, 'a.b')) == '{"c":1}';
+-------------------------+
| json_to_string(jsons.j) |
+-------------------------+
| {"a":{"b":{"c":1}}} |
+-------------------------+
SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.x'), 'c') == 'foo';
++
++
DROP TABLE jsons;
Affected Rows: 0

View File

@@ -11,6 +11,12 @@ SELECT json_get_int(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b');
SELECT json_get_string(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b');
SELECT json_to_string(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'));
SELECT json_get_int(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'), 'd');
SELECT json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.e');
-- test functions with table rows --
CREATE TABLE jsons(j JSON, ts timestamp time index);
@@ -32,6 +38,12 @@ SELECT json_get_bool(j, 'a.b.c') FROM jsons;
SELECT json_get_int(j, 'a.b["c"]') FROM jsons;
SELECT json_to_string(json_get_object(j, 'a.b')) FROM jsons;
SELECT json_get_string(json_get_object(j, 'a.b'), 'c') FROM jsons;
SELECT json_get_object(j, 'a.x') FROM jsons;
DROP TABLE jsons;
-- test functions with arrays --
@@ -45,6 +57,8 @@ INSERT INTO jsons VALUES(parse_json('[1, 0, -2147483649, 2147483648]'), 3);
INSERT INTO jsons VALUES(parse_json('[1.2, 3.1415926535897932384626, -3e123, 1e100]'), 4);
INSERT INTO jsons VALUES(parse_json('[{"a": {"i": 1}}, {"a": {"i": 2}}, {"a": {"i": 3}}]'), 5);
SELECT json_get_int(j, '[0]') FROM jsons;
SELECT json_get_float(j, '[1]') FROM jsons;
@@ -53,6 +67,12 @@ SELECT json_get_bool(j, '[2]') FROM jsons;
SELECT json_get_string(j, '[3]') FROM jsons;
SELECT json_to_string(json_get_object(j, '[0]')) FROM jsons;
SELECT json_get_int(json_get_object(j, '[0]'), 'a.i') FROM jsons;
SELECT json_get_int(json_get_object(j, '[9]'), 'a.i') FROM jsons;
DROP TABLE jsons;
-- test functions in WHERE clause --
@@ -76,4 +96,10 @@ SELECT json_to_string(j) FROM jsons WHERE json_get_bool(j, 'a.b.c') = true;
SELECT json_to_string(j) FROM jsons WHERE CAST(json_get_int(j, 'a.b.c') AS BOOLEAN);
SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.b'), 'c') == 'foo';
SELECT json_to_string(j) FROM jsons WHERE json_to_string(json_get_object(j, 'a.b')) == '{"c":1}';
SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.x'), 'c') == 'foo';
DROP TABLE jsons;