feat: add json data type (#4619)

* feat: add json type and vector

* fix: allow to create and insert json data

* feat: udf to query json as string

* refactor: remove JsonbValue and JsonVector

* feat: show json value as strings

* chore: make ci happy

* test: adunit test and sqlness test

* refactor: use binary as grpc value of json

* fix: use non-preserve-order jsonb

* test: revert changed test

* refactor: change udf get_by_path to jq

* chore: make ci happy

* fix: distinguish binary and json in proto

* chore: delete udf for future pr

* refactor: remove Value(Json)

* chore: follow review comments

* test: some tests and checks

* test: fix unit tests

* chore: follow review comments

* chore: corresponding changes to proto

* fix: change grpc and pgsql server behavior alongside with sqlness/crud tests

* chore: follow review comments

* feat: udf of conversions between json and strings, used for grpc server

* refactor: rename to_string to json_to_string

* test: add more sqlness test for json

* chore: thanks for review :)

* Apply suggestions from code review

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
This commit is contained in:
Yohan Wal
2024-09-09 19:41:36 +08:00
committed by GitHub
parent dc89944570
commit 04e7dd6fd5
30 changed files with 1076 additions and 61 deletions

View File

@@ -25,6 +25,7 @@ common-time.workspace = true
datafusion-common.workspace = true
enum_dispatch = "0.3"
greptime-proto.workspace = true
jsonb.workspace = true
num = "0.4"
num-traits = "0.2"
ordered-float = { version = "3.0", features = ["serde"] }

View File

@@ -33,8 +33,8 @@ use crate::types::{
BinaryType, BooleanType, DateTimeType, DateType, Decimal128Type, DictionaryType,
DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType, DurationSecondType,
DurationType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, ListType,
NullType, StringType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonType,
ListType, NullType, StringType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
@@ -81,6 +81,9 @@ pub enum ConcreteDataType {
// Compound types:
List(ListType),
Dictionary(DictionaryType),
// JSON type:
Json(JsonType),
}
impl fmt::Display for ConcreteDataType {
@@ -128,6 +131,7 @@ impl fmt::Display for ConcreteDataType {
ConcreteDataType::Decimal128(v) => write!(f, "{}", v.name()),
ConcreteDataType::List(v) => write!(f, "{}", v.name()),
ConcreteDataType::Dictionary(v) => write!(f, "{}", v.name()),
ConcreteDataType::Json(v) => write!(f, "{}", v.name()),
}
}
}
@@ -162,6 +166,7 @@ impl ConcreteDataType {
| ConcreteDataType::Duration(_)
| ConcreteDataType::Decimal128(_)
| ConcreteDataType::Binary(_)
| ConcreteDataType::Json(_)
)
}
@@ -216,6 +221,10 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Decimal128(_))
}
pub fn is_json(&self) -> bool {
matches!(self, ConcreteDataType::Json(_))
}
pub fn numerics() -> Vec<ConcreteDataType> {
vec![
ConcreteDataType::int8_datatype(),
@@ -404,7 +413,7 @@ macro_rules! impl_new_concrete_type_functions {
impl_new_concrete_type_functions!(
Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64,
Binary, Date, DateTime, String
Binary, Date, DateTime, String, Json
);
impl ConcreteDataType {

View File

@@ -25,6 +25,7 @@ use datafusion_common::DFSchemaRef;
use snafu::{ensure, ResultExt};
use crate::error::{self, DuplicateColumnSnafu, Error, ProjectArrowSchemaSnafu, Result};
use crate::prelude::DataType;
pub use crate::schema::column_schema::{
ColumnSchema, FulltextAnalyzer, FulltextOptions, Metadata, COMMENT_KEY, FULLTEXT_KEY,
TIME_INDEX_KEY,
@@ -34,6 +35,8 @@ pub use crate::schema::raw::RawSchema;
/// Key used to store version number of the schema in metadata.
pub const VERSION_KEY: &str = "greptime:version";
/// Key used to store actual column type in field metadata.
pub const TYPE_KEY: &str = "greptime:type";
/// A common schema, should be immutable.
#[derive(Clone, PartialEq, Eq)]
@@ -256,7 +259,13 @@ fn collect_fields(column_schemas: &[ColumnSchema]) -> Result<FieldsAndIndices> {
if column_schema.is_time_index() && timestamp_index.is_none() {
timestamp_index = Some(index);
}
let field = Field::try_from(column_schema)?;
let mut field = Field::try_from(column_schema)?;
// Json column performs the same as binary column in Arrow, so we need to mark it
if column_schema.data_type.is_json() {
let metadata = HashMap::from([(TYPE_KEY.to_string(), column_schema.data_type.name())]);
field = field.with_metadata(metadata);
}
fields.push(field);
ensure!(
name_to_index

View File

@@ -22,6 +22,8 @@ use snafu::{ensure, ResultExt};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Error, Result};
use crate::schema::constraint::ColumnDefaultConstraint;
use crate::schema::TYPE_KEY;
use crate::types::JSON_TYPE_NAME;
use crate::value::Value;
use crate::vectors::VectorRef;
@@ -268,7 +270,14 @@ impl TryFrom<&Field> for ColumnSchema {
type Error = Error;
fn try_from(field: &Field) -> Result<ColumnSchema> {
let data_type = ConcreteDataType::try_from(field.data_type())?;
let mut data_type = ConcreteDataType::try_from(field.data_type())?;
// Override the data type if it is specified in the metadata.
if field.metadata().contains_key(TYPE_KEY) {
data_type = match field.metadata().get(TYPE_KEY).unwrap().as_str() {
JSON_TYPE_NAME => ConcreteDataType::json_datatype(),
_ => data_type,
};
}
let mut metadata = field.metadata().clone();
let default_constraint = match metadata.remove(DEFAULT_CONSTRAINT_KEY) {
Some(json) => {
@@ -528,4 +537,32 @@ mod tests {
assert_eq!(formatted_int8, "test_column_1 Int8 null");
assert_eq!(formatted_int32, "test_column_2 Int32 not null");
}
#[test]
fn test_from_field_to_column_schema() {
let field = Field::new("test", ArrowDataType::Int32, true);
let column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!("test", column_schema.name);
assert_eq!(ConcreteDataType::int32_datatype(), column_schema.data_type);
assert!(column_schema.is_nullable);
assert!(!column_schema.is_time_index);
assert!(column_schema.default_constraint.is_none());
assert!(column_schema.metadata.is_empty());
let field = Field::new("test", ArrowDataType::Binary, true);
let field = field.with_metadata(Metadata::from([(
TYPE_KEY.to_string(),
ConcreteDataType::json_datatype().name(),
)]));
let column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!("test", column_schema.name);
assert_eq!(ConcreteDataType::json_datatype(), column_schema.data_type);
assert!(column_schema.is_nullable);
assert!(!column_schema.is_time_index);
assert!(column_schema.default_constraint.is_none());
assert_eq!(
column_schema.metadata.get(TYPE_KEY).unwrap(),
&ConcreteDataType::json_datatype().name()
);
}
}

View File

@@ -68,6 +68,8 @@ pub enum LogicalTypeId {
List,
Dictionary,
Json,
}
impl LogicalTypeId {
@@ -126,6 +128,7 @@ impl LogicalTypeId {
LogicalTypeId::DurationMicrosecond => ConcreteDataType::duration_microsecond_datatype(),
LogicalTypeId::DurationNanosecond => ConcreteDataType::duration_nanosecond_datatype(),
LogicalTypeId::Decimal128 => ConcreteDataType::decimal128_default_datatype(),
LogicalTypeId::Json => ConcreteDataType::json_datatype(),
}
}
}

View File

@@ -21,6 +21,7 @@ mod decimal_type;
mod dictionary_type;
mod duration_type;
mod interval_type;
mod json_type;
mod list_type;
mod null_type;
mod primitive_type;
@@ -42,6 +43,7 @@ pub use duration_type::{
pub use interval_type::{
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType,
};
pub use json_type::{JsonType, JSON_TYPE_NAME};
pub use list_type::ListType;
pub use null_type::NullType;
pub use primitive_type::{

View File

@@ -0,0 +1,67 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::Bytes;
use serde::{Deserialize, Serialize};
use crate::data_type::{DataType, DataTypeRef};
use crate::scalars::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId;
use crate::value::Value;
use crate::vectors::{BinaryVectorBuilder, MutableVector};
pub const JSON_TYPE_NAME: &str = "Json";
/// JsonType is a data type for JSON data. It is stored as binary data of jsonb format.
/// It utilizes current binary value and vector implementation.
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub struct JsonType;
impl JsonType {
pub fn arc() -> DataTypeRef {
Arc::new(Self)
}
}
impl DataType for JsonType {
fn name(&self) -> String {
JSON_TYPE_NAME.to_string()
}
fn logical_type_id(&self) -> LogicalTypeId {
LogicalTypeId::Json
}
fn default_value(&self) -> Value {
Bytes::default().into()
}
fn as_arrow_type(&self) -> ArrowDataType {
ArrowDataType::Binary
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(BinaryVectorBuilder::with_capacity(capacity))
}
fn try_cast(&self, from: Value) -> Option<Value> {
match from {
Value::Binary(v) => Some(Value::Binary(v)),
_ => None,
}
}
}

View File

@@ -342,7 +342,8 @@ impl Value {
let value_type_id = self.logical_type_id();
let output_type_id = output_type.logical_type_id();
ensure!(
output_type_id == value_type_id || self.is_null(),
// Json type leverage Value(Binary) for storage.
output_type_id == value_type_id || self.is_null() || (output_type_id == LogicalTypeId::Json && value_type_id == LogicalTypeId::Binary),
error::ToScalarValueSnafu {
reason: format!(
"expect value to return output_type {output_type_id:?}, actual: {value_type_id:?}",
@@ -484,7 +485,7 @@ pub fn to_null_scalar_value(output_type: &ConcreteDataType) -> Result<ScalarValu
ConcreteDataType::UInt64(_) => ScalarValue::UInt64(None),
ConcreteDataType::Float32(_) => ScalarValue::Float32(None),
ConcreteDataType::Float64(_) => ScalarValue::Float64(None),
ConcreteDataType::Binary(_) => ScalarValue::Binary(None),
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) => ScalarValue::Binary(None),
ConcreteDataType::String(_) => ScalarValue::Utf8(None),
ConcreteDataType::Date(_) => ScalarValue::Date32(None),
ConcreteDataType::DateTime(_) => ScalarValue::Date64(None),
@@ -1994,6 +1995,10 @@ mod tests {
&ConcreteDataType::duration_nanosecond_datatype(),
&Value::Duration(Duration::new_nanosecond(1)),
);
check_type_and_value(
&ConcreteDataType::decimal128_datatype(38, 10),
&Value::Decimal128(Decimal128::new(1, 38, 10)),
);
}
#[test]
@@ -2178,6 +2183,14 @@ mod tests {
ValueRef::List(ListValueRef::Ref { val: &list }),
Value::List(list.clone()).as_value_ref()
);
let jsonb_value = jsonb::parse_value(r#"{"key": "value"}"#.as_bytes())
.unwrap()
.to_vec();
assert_eq!(
ValueRef::Binary(jsonb_value.clone().as_slice()),
Value::Binary(jsonb_value.into()).as_value_ref()
);
}
#[test]
@@ -2391,6 +2404,16 @@ mod tests {
.try_to_scalar_value(&ConcreteDataType::binary_datatype())
.unwrap()
);
let jsonb_value = jsonb::parse_value(r#"{"key": "value"}"#.as_bytes())
.unwrap()
.to_vec();
assert_eq!(
ScalarValue::Binary(Some(jsonb_value.clone())),
Value::Binary(jsonb_value.into())
.try_to_scalar_value(&ConcreteDataType::json_datatype())
.unwrap()
);
}
#[test]
@@ -2523,6 +2546,12 @@ mod tests {
.try_to_scalar_value(&ConcreteDataType::duration_nanosecond_datatype())
.unwrap()
);
assert_eq!(
ScalarValue::Binary(None),
Value::Null
.try_to_scalar_value(&ConcreteDataType::json_datatype())
.unwrap()
);
}
#[test]

View File

@@ -80,7 +80,7 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
match lhs.data_type() {
Null(_) => true,
Boolean(_) => is_vector_eq!(BooleanVector, lhs, rhs),
Binary(_) => is_vector_eq!(BinaryVector, lhs, rhs),
Binary(_) | Json(_) => is_vector_eq!(BinaryVector, lhs, rhs),
String(_) => is_vector_eq!(StringVector, lhs, rhs),
Date(_) => is_vector_eq!(DateVector, lhs, rhs),
DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs),