feat: add json data type (#4619)

* feat: add json type and vector

* fix: allow to create and insert json data

* feat: udf to query json as string

* refactor: remove JsonbValue and JsonVector

* feat: show json value as strings

* chore: make ci happy

* test: adunit test and sqlness test

* refactor: use binary as grpc value of json

* fix: use non-preserve-order jsonb

* test: revert changed test

* refactor: change udf get_by_path to jq

* chore: make ci happy

* fix: distinguish binary and json in proto

* chore: delete udf for future pr

* refactor: remove Value(Json)

* chore: follow review comments

* test: some tests and checks

* test: fix unit tests

* chore: follow review comments

* chore: corresponding changes to proto

* fix: change grpc and pgsql server behavior alongside with sqlness/crud tests

* chore: follow review comments

* feat: udf of conversions between json and strings, used for grpc server

* refactor: rename to_string to json_to_string

* test: add more sqlness test for json

* chore: thanks for review :)

* Apply suggestions from code review

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
This commit is contained in:
Yohan Wal
2024-09-09 19:41:36 +08:00
committed by GitHub
parent dc89944570
commit 04e7dd6fd5
30 changed files with 1076 additions and 61 deletions

View File

@@ -29,6 +29,7 @@ datafusion.workspace = true
datatypes.workspace = true
geohash = { version = "0.13", optional = true }
h3o = { version = "0.6", optional = true }
jsonb.workspace = true
num = "0.4"
num-traits = "0.2"
once_cell.workspace = true

View File

@@ -22,6 +22,7 @@ use crate::function::{AsyncFunctionRef, FunctionRef};
use crate::scalars::aggregate::{AggregateFunctionMetaRef, AggregateFunctions};
use crate::scalars::date::DateFunction;
use crate::scalars::expression::ExpressionFunction;
use crate::scalars::json::JsonFunction;
use crate::scalars::matches::MatchesFunction;
use crate::scalars::math::MathFunction;
use crate::scalars::numpy::NumpyFunction;
@@ -116,6 +117,9 @@ pub static FUNCTION_REGISTRY: Lazy<Arc<FunctionRegistry>> = Lazy::new(|| {
SystemFunction::register(&function_registry);
TableFunction::register(&function_registry);
// Json related functions
JsonFunction::register(&function_registry);
// Geo functions
#[cfg(feature = "geo")]
crate::scalars::geo::GeoFunctions::register(&function_registry);

View File

@@ -17,9 +17,11 @@ pub(crate) mod date;
pub mod expression;
#[cfg(feature = "geo")]
pub mod geo;
pub mod json;
pub mod matches;
pub mod math;
pub mod numpy;
#[cfg(test)]
pub(crate) mod test;
pub(crate) mod timestamp;

View File

@@ -0,0 +1,31 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
mod json_to_string;
mod to_json;
use json_to_string::JsonToStringFunction;
use to_json::ToJsonFunction;
use crate::function_registry::FunctionRegistry;
pub(crate) struct JsonFunction;
impl JsonFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(JsonToStringFunction));
registry.register(Arc::new(ToJsonFunction));
}
}

View File

@@ -0,0 +1,174 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use datafusion::logical_expr::Volatility;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::{MutableVector, StringVectorBuilder};
use snafu::ensure;
use crate::function::{Function, FunctionContext};
/// Converts the `JSONB` into `String`. It's useful for displaying JSONB content.
#[derive(Clone, Debug, Default)]
pub struct JsonToStringFunction;
const NAME: &str = "json_to_string";
impl Function for JsonToStringFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::string_datatype())
}
fn signature(&self) -> Signature {
Signature::exact(
vec![ConcreteDataType::json_datatype()],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 1,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly one, have: {}",
columns.len()
),
}
);
let jsons = &columns[0];
let size = jsons.len();
let datatype = jsons.data_type();
let mut results = StringVectorBuilder::with_capacity(size);
match datatype {
// JSON data type uses binary vector
ConcreteDataType::Binary(_) => {
for i in 0..size {
let json = jsons.get_ref(i);
let json = json.as_binary();
let result = match json {
Ok(Some(json)) => match jsonb::from_slice(json) {
Ok(json) => {
let json = json.to_string();
Some(json)
}
Err(_) => {
return InvalidFuncArgsSnafu {
err_msg: format!("Illegal json binary: {:?}", json),
}
.fail()
}
},
_ => None,
};
results.push(result.as_deref());
}
}
_ => {
return UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
.fail();
}
}
Ok(results.to_vector())
}
}
impl Display for JsonToStringFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "JSON_TO_STRING")
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_query::prelude::TypeSignature;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::BinaryVector;
use super::*;
#[test]
fn test_get_by_path_function() {
let json_to_string = JsonToStringFunction;
assert_eq!("json_to_string", json_to_string.name());
assert_eq!(
ConcreteDataType::string_datatype(),
json_to_string
.return_type(&[ConcreteDataType::json_datatype()])
.unwrap()
);
assert!(matches!(json_to_string.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::json_datatype()]
));
let json_strings = [
r#"{"a": {"b": 2}, "b": 2, "c": 3}"#,
r#"{"a": 4, "b": {"c": 6}, "c": 6}"#,
r#"{"a": 7, "b": 8, "c": {"a": 7}}"#,
];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_vector = BinaryVector::from_vec(jsonbs);
let args: Vec<VectorRef> = vec![Arc::new(json_vector)];
let vector = json_to_string
.eval(FunctionContext::default(), &args)
.unwrap();
assert_eq!(3, vector.len());
for (i, gt) in json_strings.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_string().unwrap().unwrap();
// remove whitespaces
assert_eq!(gt.replace(" ", ""), result);
}
let invalid_jsonb = vec![b"invalid json"];
let invalid_json_vector = BinaryVector::from_vec(invalid_jsonb);
let args: Vec<VectorRef> = vec![Arc::new(invalid_json_vector)];
let vector = json_to_string.eval(FunctionContext::default(), &args);
assert!(vector.is_err());
}
}

View File

@@ -0,0 +1,165 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use datafusion::logical_expr::Volatility;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::{BinaryVectorBuilder, MutableVector};
use snafu::ensure;
use crate::function::{Function, FunctionContext};
/// Parses the `String` into `JSONB`.
#[derive(Clone, Debug, Default)]
pub struct ToJsonFunction;
const NAME: &str = "to_json";
impl Function for ToJsonFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::json_datatype())
}
fn signature(&self) -> Signature {
Signature::exact(
vec![ConcreteDataType::string_datatype()],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 1,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly one, have: {}",
columns.len()
),
}
);
let json_strings = &columns[0];
let size = json_strings.len();
let datatype = json_strings.data_type();
let mut results = BinaryVectorBuilder::with_capacity(size);
match datatype {
ConcreteDataType::String(_) => {
for i in 0..size {
let json_string = json_strings.get_ref(i);
let json_string = json_string.as_string();
let result = match json_string {
Ok(Some(json_string)) => match jsonb::parse_value(json_string.as_bytes()) {
Ok(json) => Some(json.to_vec()),
Err(_) => {
return InvalidFuncArgsSnafu {
err_msg: format!(
"Cannot convert the string to json, have: {}",
json_string
),
}
.fail()
}
},
_ => None,
};
results.push(result.as_deref());
}
}
_ => {
return UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
.fail();
}
}
Ok(results.to_vector())
}
}
impl Display for ToJsonFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TO_JSON")
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_query::prelude::TypeSignature;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::StringVector;
use super::*;
#[test]
fn test_get_by_path_function() {
let to_json = ToJsonFunction;
assert_eq!("to_json", to_json.name());
assert_eq!(
ConcreteDataType::json_datatype(),
to_json
.return_type(&[ConcreteDataType::json_datatype()])
.unwrap()
);
assert!(matches!(to_json.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::string_datatype()]
));
let json_strings = [
r#"{"a": {"b": 2}, "b": 2, "c": 3}"#,
r#"{"a": 4, "b": {"c": 6}, "c": 6}"#,
r#"{"a": 7, "b": 8, "c": {"a": 7}}"#,
];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_string_vector = StringVector::from_vec(json_strings.to_vec());
let args: Vec<VectorRef> = vec![Arc::new(json_string_vector)];
let vector = to_json.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(3, vector.len());
for (i, gt) in jsonbs.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_binary().unwrap().unwrap();
// remove whitespaces
assert_eq!(gt, result);
}
}
}