mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-14 12:00:40 +00:00
json2 type
insert flush query-driven and data-driven concretize select compaction Signed-off-by: luofucong <luofc@foxmail.com>
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -7990,6 +7990,7 @@ version = "1.0.0-rc.2"
|
||||
dependencies = [
|
||||
"api",
|
||||
"aquamarine",
|
||||
"arrow-schema 57.3.0",
|
||||
"async-channel 1.9.0",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
|
||||
@@ -129,7 +129,7 @@ impl From<ColumnDataTypeWrapper> for ConcreteDataType {
|
||||
};
|
||||
ConcreteDataType::json_native_datatype(inner_type.into())
|
||||
}
|
||||
None => ConcreteDataType::Json(JsonType::null()),
|
||||
None => ConcreteDataType::Json(JsonType::new(JsonFormat::Json2)),
|
||||
_ => {
|
||||
// invalid state, type extension is missing or invalid
|
||||
ConcreteDataType::null_datatype()
|
||||
@@ -461,6 +461,7 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
|
||||
})
|
||||
}
|
||||
}
|
||||
JsonFormat::Json2 => Some(ColumnDataTypeExtension { type_ext: None }),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
|
||||
@@ -248,6 +248,7 @@ impl ObjbenchCommand {
|
||||
op_type: OperationType::Flush,
|
||||
metadata: region_meta,
|
||||
source: FlatSource::Stream(reader_stream),
|
||||
schema: None,
|
||||
cache_manager,
|
||||
storage: None,
|
||||
max_sequence: None,
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod json2_get;
|
||||
mod json2_get_rewriter;
|
||||
pub mod json_get;
|
||||
mod json_get_rewriter;
|
||||
mod json_is;
|
||||
@@ -26,6 +28,8 @@ use json_is::{
|
||||
JsonIsArray, JsonIsBool, JsonIsFloat, JsonIsInt, JsonIsNull, JsonIsObject, JsonIsString,
|
||||
};
|
||||
use json_to_string::JsonToStringFunction;
|
||||
use json2_get::Json2GetFunction;
|
||||
use json2_get_rewriter::Json2GetRewriter;
|
||||
use parse_json::ParseJsonFunction;
|
||||
|
||||
use crate::function_registry::FunctionRegistry;
|
||||
@@ -44,6 +48,7 @@ impl JsonFunction {
|
||||
registry.register_scalar(JsonGetBool::default());
|
||||
registry.register_scalar(JsonGetObject::default());
|
||||
registry.register_scalar(JsonGetWithType::default());
|
||||
registry.register_scalar(Json2GetFunction::default());
|
||||
|
||||
registry.register_scalar(JsonIsNull::default());
|
||||
registry.register_scalar(JsonIsInt::default());
|
||||
@@ -57,5 +62,6 @@ impl JsonFunction {
|
||||
registry.register_scalar(json_path_match::JsonPathMatchFunction::default());
|
||||
|
||||
registry.register_function_rewrite(JsonGetRewriter);
|
||||
registry.register_function_rewrite(Json2GetRewriter);
|
||||
}
|
||||
}
|
||||
|
||||
145
src/common/function/src/scalars/json/json2_get.rs
Normal file
145
src/common/function/src/scalars/json/json2_get.rs
Normal file
@@ -0,0 +1,145 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::StringViewBuilder;
|
||||
use arrow_cast::display::ArrayFormatter;
|
||||
use datafusion_common::arrow::array::{Array, ArrayRef, StructArray, new_null_array};
|
||||
use datafusion_common::arrow::datatypes::{DataType, Field};
|
||||
use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
|
||||
use datafusion_expr::{
|
||||
ColumnarValue, Expr, ReturnFieldArgs, ScalarFunctionArgs, Signature, Volatility,
|
||||
};
|
||||
use derive_more::Display;
|
||||
|
||||
use crate::function::Function;
|
||||
|
||||
#[derive(Display, Debug)]
|
||||
#[display("{}", Self::NAME.to_ascii_uppercase())]
|
||||
pub struct Json2GetFunction {
|
||||
signature: Signature,
|
||||
}
|
||||
|
||||
impl Json2GetFunction {
|
||||
pub const NAME: &'static str = "json2_get";
|
||||
}
|
||||
|
||||
impl Function for Json2GetFunction {
|
||||
fn name(&self) -> &str {
|
||||
Self::NAME
|
||||
}
|
||||
|
||||
fn return_type(&self, _: &[DataType]) -> Result<DataType> {
|
||||
internal_err!("this method isn't meant to be called")
|
||||
}
|
||||
|
||||
fn signature(&self) -> &Signature {
|
||||
&self.signature
|
||||
}
|
||||
|
||||
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
|
||||
if args.args.len() != 3 {
|
||||
return exec_err!("json2_get expects 3 arguments, got {}", args.args.len());
|
||||
}
|
||||
|
||||
let input = args.args[0].to_array(args.number_rows)?;
|
||||
let path = path_from_arg(&args.args[1])?;
|
||||
let return_type = args.return_field.data_type();
|
||||
|
||||
let segments = path.split('.').collect::<Vec<_>>();
|
||||
let Some(leaf) = resolve_leaf_path(&input, &segments) else {
|
||||
return Ok(ColumnarValue::Array(new_null_array(
|
||||
return_type,
|
||||
input.len(),
|
||||
)));
|
||||
};
|
||||
|
||||
let casted = if arrow_cast::can_cast_types(leaf.data_type(), return_type) {
|
||||
arrow_cast::cast(leaf.as_ref(), return_type)?
|
||||
} else if return_type.is_string() {
|
||||
cast_array_to_string(&leaf)?
|
||||
} else {
|
||||
return Ok(ColumnarValue::Array(new_null_array(
|
||||
return_type,
|
||||
input.len(),
|
||||
)));
|
||||
};
|
||||
|
||||
Ok(ColumnarValue::Array(casted))
|
||||
}
|
||||
|
||||
fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result<Arc<Field>> {
|
||||
let Some(Some(value)) = args.scalar_arguments.get(2) else {
|
||||
return internal_err!(
|
||||
"third argument of function {} must be present and is scalar",
|
||||
self.name()
|
||||
);
|
||||
};
|
||||
Ok(Arc::new(Field::new(
|
||||
"json2_get expected type",
|
||||
value.data_type(),
|
||||
true,
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Json2GetFunction {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
signature: Signature::any(3, Volatility::Immutable),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn path_from_arg(arg: &ColumnarValue) -> Result<&String> {
|
||||
match arg {
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(path)))
|
||||
| ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(path)))
|
||||
| ColumnarValue::Scalar(ScalarValue::Utf8View(Some(path))) => Ok(path),
|
||||
ColumnarValue::Scalar(_) => exec_err!("json2_get expects a string path"),
|
||||
ColumnarValue::Array(_) => exec_err!("json2_get expects a literal path"),
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_leaf_path(array: &ArrayRef, segments: &[&str]) -> Option<ArrayRef> {
|
||||
if segments.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut current = array.clone();
|
||||
for segment in segments {
|
||||
let struct_array = current.as_any().downcast_ref::<StructArray>()?;
|
||||
let DataType::Struct(fields) = current.data_type() else {
|
||||
unreachable!()
|
||||
};
|
||||
let index = fields.iter().position(|field| field.name() == *segment)?;
|
||||
current = struct_array.column(index).clone();
|
||||
}
|
||||
Some(current)
|
||||
}
|
||||
|
||||
fn cast_array_to_string(array: &ArrayRef) -> Result<ArrayRef> {
|
||||
let mut builder = StringViewBuilder::with_capacity(array.len());
|
||||
let formatter = ArrayFormatter::try_new(array, &Default::default())?;
|
||||
for i in 0..array.len() {
|
||||
let value = array.is_valid(i).then(|| formatter.value(i).to_string());
|
||||
builder.append_option(value);
|
||||
}
|
||||
Ok(Arc::new(builder.finish()))
|
||||
}
|
||||
|
||||
pub fn datatype_expr(data_type: &DataType) -> Result<Expr> {
|
||||
ScalarValue::try_new_null(data_type).map(|x| Expr::Literal(x, None))
|
||||
}
|
||||
82
src/common/function/src/scalars/json/json2_get_rewriter.rs
Normal file
82
src/common/function/src/scalars/json/json2_get_rewriter.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use arrow_schema::DataType;
|
||||
use datafusion::common::config::ConfigOptions;
|
||||
use datafusion::common::tree_node::Transformed;
|
||||
use datafusion::common::{DFSchema, Result};
|
||||
use datafusion::logical_expr::expr_rewriter::FunctionRewrite;
|
||||
use datafusion::scalar::ScalarValue;
|
||||
use datafusion_common::{exec_err, internal_err};
|
||||
use datafusion_expr::Expr;
|
||||
|
||||
use crate::scalars::json::json2_get::{Json2GetFunction, datatype_expr};
|
||||
|
||||
/// Function rewrite that folds an `arrow_cast` wrapped around `json2_get`
/// into the `json2_get` call itself.
#[derive(Debug)]
pub(crate) struct Json2GetRewriter;
|
||||
|
||||
impl FunctionRewrite for Json2GetRewriter {
|
||||
fn name(&self) -> &'static str {
|
||||
"Json2GetRewriter"
|
||||
}
|
||||
|
||||
fn rewrite(
|
||||
&self,
|
||||
expr: Expr,
|
||||
_schema: &DFSchema,
|
||||
_config: &ConfigOptions,
|
||||
) -> Result<Transformed<Expr>> {
|
||||
let (expr, rewritten) = reduce_arrow_cast(expr)?;
|
||||
if rewritten {
|
||||
Ok(Transformed::yes(expr))
|
||||
} else {
|
||||
Ok(Transformed::no(expr))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// arrow_cast(json2_get(_, _, _), "<type>") => json2_get(_, _, "<type>")
|
||||
fn reduce_arrow_cast(expr: Expr) -> Result<(Expr, bool)> {
|
||||
let mut f = match expr {
|
||||
Expr::ScalarFunction(f) => f,
|
||||
expr => return Ok((expr, false)),
|
||||
};
|
||||
if f.name() != "arrow_cast" {
|
||||
return Ok((Expr::ScalarFunction(f), false));
|
||||
}
|
||||
if !matches!(&f.args[0], Expr::ScalarFunction(f) if f.name() == Json2GetFunction::NAME) {
|
||||
return Ok((Expr::ScalarFunction(f), false));
|
||||
}
|
||||
|
||||
if f.args.len() != 2 {
|
||||
return internal_err!("arrow_cast must have 2 arguments");
|
||||
}
|
||||
let target_type = match &f.args[1] {
|
||||
Expr::Literal(ScalarValue::Utf8(Some(target_type)), _) => target_type
|
||||
.parse::<DataType>()
|
||||
.map_err(Into::into)
|
||||
.and_then(|x| datatype_expr(&x))?,
|
||||
x => return exec_err!("arrow_cast expects 2nd argument a string, got: {:?}", x),
|
||||
};
|
||||
|
||||
let Expr::ScalarFunction(mut json2_get) = f.args.remove(0) else {
|
||||
// checked in above "matches!"
|
||||
unreachable!()
|
||||
};
|
||||
if json2_get.args.len() != 3 {
|
||||
return internal_err!("function {} must have 3 arguments", Json2GetFunction::NAME);
|
||||
}
|
||||
json2_get.args[2] = target_type;
|
||||
Ok((Expr::ScalarFunction(json2_get), true))
|
||||
}
|
||||
@@ -188,13 +188,6 @@ pub enum Error {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to align JSON array, reason: {reason}"))]
|
||||
AlignJsonArray {
|
||||
reason: String,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
}
|
||||
|
||||
impl ErrorExt for Error {
|
||||
@@ -210,8 +203,7 @@ impl ErrorExt for Error {
|
||||
| Error::ToArrowScalar { .. }
|
||||
| Error::ProjectArrowRecordBatch { .. }
|
||||
| Error::PhysicalExpr { .. }
|
||||
| Error::RecordBatchSliceIndexOverflow { .. }
|
||||
| Error::AlignJsonArray { .. } => StatusCode::Internal,
|
||||
| Error::RecordBatchSliceIndexOverflow { .. } => StatusCode::Internal,
|
||||
|
||||
Error::PollStream { .. } => StatusCode::EngineExecuteQuery,
|
||||
|
||||
|
||||
@@ -20,10 +20,11 @@ use datafusion::arrow::util::pretty::pretty_format_batches;
|
||||
use datafusion_common::arrow::array::ArrayRef;
|
||||
use datafusion_common::arrow::compute;
|
||||
use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef as ArrowSchemaRef};
|
||||
use datatypes::arrow::array::{Array, AsArray, RecordBatchOptions, StructArray, new_null_array};
|
||||
use datatypes::arrow::array::{Array, AsArray, RecordBatchOptions};
|
||||
use datatypes::extension::json::is_json_extension_type;
|
||||
use datatypes::prelude::DataType;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use datatypes::vectors::json::array::JsonArray;
|
||||
use datatypes::vectors::{Helper, VectorRef};
|
||||
use serde::ser::{Error, SerializeStruct};
|
||||
use serde::{Serialize, Serializer};
|
||||
@@ -31,8 +32,8 @@ use snafu::{OptionExt, ResultExt, ensure};
|
||||
|
||||
use crate::DfRecordBatch;
|
||||
use crate::error::{
|
||||
self, AlignJsonArraySnafu, ArrowComputeSnafu, ColumnNotExistsSnafu, DataTypesSnafu,
|
||||
NewDfRecordBatchSnafu, ProjectArrowRecordBatchSnafu, Result,
|
||||
self, ArrowComputeSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu,
|
||||
Result,
|
||||
};
|
||||
|
||||
/// A two-dimensional batch of column-oriented data with a defined schema.
|
||||
@@ -354,81 +355,7 @@ pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Resul
|
||||
Ok(RecordBatch::from_df_record_batch(schema, record_batch))
|
||||
}
|
||||
|
||||
/// Align a json array `json_array` to the json type `schema_type`. The `schema_type` is often the
|
||||
/// "largest" json type after some insertions in the table schema, while the json array previously
|
||||
/// written in the SST could be lagged behind it. So it's important to "amend" the json array's
|
||||
/// missing fields with null arrays, to align the array's data type with the provided one.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - The json array is not an Arrow [StructArray], or the provided data type `schema_type` is not
|
||||
/// of Struct type. Both of which shouldn't happen unless we switch our implementation of how
|
||||
/// json array is physically stored.
|
||||
pub fn align_json_array(json_array: &ArrayRef, schema_type: &ArrowDataType) -> Result<ArrayRef> {
|
||||
let json_type = json_array.data_type();
|
||||
if json_type == schema_type {
|
||||
return Ok(json_array.clone());
|
||||
}
|
||||
|
||||
let json_array = json_array.as_struct();
|
||||
let array_fields = json_array.fields();
|
||||
let array_columns = json_array.columns();
|
||||
let ArrowDataType::Struct(schema_fields) = schema_type else {
|
||||
unreachable!()
|
||||
};
|
||||
let mut aligned = Vec::with_capacity(schema_fields.len());
|
||||
|
||||
// Compare the fields in the json array and the to-be-aligned schema, amending with null arrays
|
||||
// on the way. It's very important to note that fields in the json array and in the json type
|
||||
// are both SORTED.
|
||||
|
||||
let mut i = 0; // point to the schema fields
|
||||
let mut j = 0; // point to the array fields
|
||||
while i < schema_fields.len() && j < array_fields.len() {
|
||||
let schema_field = &schema_fields[i];
|
||||
let array_field = &array_fields[j];
|
||||
if schema_field.name() == array_field.name() {
|
||||
if matches!(schema_field.data_type(), ArrowDataType::Struct(_)) {
|
||||
// A `StructArray`s in a json array must be another json array. (Like a nested json
|
||||
// object in a json value.)
|
||||
aligned.push(align_json_array(
|
||||
&array_columns[j],
|
||||
schema_field.data_type(),
|
||||
)?);
|
||||
} else {
|
||||
aligned.push(array_columns[j].clone());
|
||||
}
|
||||
j += 1;
|
||||
} else {
|
||||
aligned.push(new_null_array(schema_field.data_type(), json_array.len()));
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
if i < schema_fields.len() {
|
||||
for field in &schema_fields[i..] {
|
||||
aligned.push(new_null_array(field.data_type(), json_array.len()));
|
||||
}
|
||||
}
|
||||
ensure!(
|
||||
j == array_fields.len(),
|
||||
AlignJsonArraySnafu {
|
||||
reason: format!(
|
||||
"this json array has more fields {:?}",
|
||||
array_fields[j..]
|
||||
.iter()
|
||||
.map(|x| x.name())
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
}
|
||||
);
|
||||
|
||||
let json_array =
|
||||
StructArray::try_new(schema_fields.clone(), aligned, json_array.nulls().cloned())
|
||||
.context(NewDfRecordBatchSnafu)?;
|
||||
Ok(Arc::new(json_array))
|
||||
}
|
||||
|
||||
fn maybe_align_json_array_with_schema(
|
||||
pub fn maybe_align_json_array_with_schema(
|
||||
schema: &ArrowSchemaRef,
|
||||
arrays: Vec<ArrayRef>,
|
||||
) -> Result<Vec<ArrayRef>> {
|
||||
@@ -443,7 +370,9 @@ fn maybe_align_json_array_with_schema(
|
||||
continue;
|
||||
}
|
||||
|
||||
let json_array = align_json_array(&array, field.data_type())?;
|
||||
let json_array = JsonArray::from(&array)
|
||||
.try_align(field.data_type())
|
||||
.context(DataTypesSnafu)?;
|
||||
aligned.push(json_array);
|
||||
}
|
||||
Ok(aligned)
|
||||
@@ -453,12 +382,8 @@ fn maybe_align_json_array_with_schema(
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use datatypes::arrow::array::{
|
||||
AsArray, BooleanArray, Float64Array, Int64Array, ListArray, UInt32Array,
|
||||
};
|
||||
use datatypes::arrow::datatypes::{
|
||||
DataType, Field, Fields, Int64Type, Schema as ArrowSchema, UInt32Type,
|
||||
};
|
||||
use datatypes::arrow::array::{AsArray, UInt32Array};
|
||||
use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, UInt32Type};
|
||||
use datatypes::arrow_array::StringArray;
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnSchema, Schema};
|
||||
@@ -466,165 +391,6 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_align_json_array() -> Result<()> {
|
||||
struct TestCase {
|
||||
json_array: ArrayRef,
|
||||
schema_type: DataType,
|
||||
expected: std::result::Result<ArrayRef, String>,
|
||||
}
|
||||
|
||||
impl TestCase {
|
||||
fn new(
|
||||
json_array: StructArray,
|
||||
schema_type: Fields,
|
||||
expected: std::result::Result<Vec<ArrayRef>, String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
json_array: Arc::new(json_array),
|
||||
schema_type: DataType::Struct(schema_type.clone()),
|
||||
expected: expected
|
||||
.map(|x| Arc::new(StructArray::new(schema_type, x, None)) as ArrayRef),
|
||||
}
|
||||
}
|
||||
|
||||
fn test(self) -> Result<()> {
|
||||
let result = align_json_array(&self.json_array, &self.schema_type);
|
||||
match (result, self.expected) {
|
||||
(Ok(json_array), Ok(expected)) => assert_eq!(&json_array, &expected),
|
||||
(Ok(json_array), Err(e)) => {
|
||||
panic!("expecting error {e} but actually get: {json_array:?}")
|
||||
}
|
||||
(Err(e), Err(expected)) => assert_eq!(e.to_string(), expected),
|
||||
(Err(e), Ok(_)) => return Err(e),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Test empty json array can be aligned with a complex json type.
|
||||
TestCase::new(
|
||||
StructArray::new_empty_fields(2, None),
|
||||
Fields::from(vec![
|
||||
Field::new("int", DataType::Int64, true),
|
||||
Field::new_struct(
|
||||
"nested",
|
||||
vec![Field::new("bool", DataType::Boolean, true)],
|
||||
true,
|
||||
),
|
||||
Field::new("string", DataType::Utf8, true),
|
||||
]),
|
||||
Ok(vec![
|
||||
Arc::new(Int64Array::new_null(2)) as ArrayRef,
|
||||
Arc::new(StructArray::new_null(
|
||||
Fields::from(vec![Arc::new(Field::new("bool", DataType::Boolean, true))]),
|
||||
2,
|
||||
)),
|
||||
Arc::new(StringArray::new_null(2)),
|
||||
]),
|
||||
)
|
||||
.test()?;
|
||||
|
||||
// Test simple json array alignment.
|
||||
TestCase::new(
|
||||
StructArray::from(vec![(
|
||||
Arc::new(Field::new("float", DataType::Float64, true)),
|
||||
Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef,
|
||||
)]),
|
||||
Fields::from(vec![
|
||||
Field::new("float", DataType::Float64, true),
|
||||
Field::new("string", DataType::Utf8, true),
|
||||
]),
|
||||
Ok(vec![
|
||||
Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef,
|
||||
Arc::new(StringArray::new_null(3)),
|
||||
]),
|
||||
)
|
||||
.test()?;
|
||||
|
||||
// Test complex json array alignment.
|
||||
TestCase::new(
|
||||
StructArray::from(vec![
|
||||
(
|
||||
Arc::new(Field::new_list(
|
||||
"list",
|
||||
Field::new_list_field(DataType::Int64, true),
|
||||
true,
|
||||
)),
|
||||
Arc::new(ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
|
||||
Some(vec![Some(1)]),
|
||||
None,
|
||||
Some(vec![Some(2), Some(3)]),
|
||||
])) as ArrayRef,
|
||||
),
|
||||
(
|
||||
Arc::new(Field::new_struct(
|
||||
"nested",
|
||||
vec![Field::new("int", DataType::Int64, true)],
|
||||
true,
|
||||
)),
|
||||
Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("int", DataType::Int64, true)),
|
||||
Arc::new(Int64Array::from(vec![-1, -2, -3])) as ArrayRef,
|
||||
)])),
|
||||
),
|
||||
(
|
||||
Arc::new(Field::new("string", DataType::Utf8, true)),
|
||||
Arc::new(StringArray::from(vec!["a", "b", "c"])),
|
||||
),
|
||||
]),
|
||||
Fields::from(vec![
|
||||
Field::new("bool", DataType::Boolean, true),
|
||||
Field::new_list("list", Field::new_list_field(DataType::Int64, true), true),
|
||||
Field::new_struct(
|
||||
"nested",
|
||||
vec![
|
||||
Field::new("float", DataType::Float64, true),
|
||||
Field::new("int", DataType::Int64, true),
|
||||
],
|
||||
true,
|
||||
),
|
||||
Field::new("string", DataType::Utf8, true),
|
||||
]),
|
||||
Ok(vec![
|
||||
Arc::new(BooleanArray::new_null(3)) as ArrayRef,
|
||||
Arc::new(ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
|
||||
Some(vec![Some(1)]),
|
||||
None,
|
||||
Some(vec![Some(2), Some(3)]),
|
||||
])),
|
||||
Arc::new(StructArray::from(vec![
|
||||
(
|
||||
Arc::new(Field::new("float", DataType::Float64, true)),
|
||||
Arc::new(Float64Array::new_null(3)) as ArrayRef,
|
||||
),
|
||||
(
|
||||
Arc::new(Field::new("int", DataType::Int64, true)),
|
||||
Arc::new(Int64Array::from(vec![-1, -2, -3])),
|
||||
),
|
||||
])),
|
||||
Arc::new(StringArray::from(vec!["a", "b", "c"])),
|
||||
]),
|
||||
)
|
||||
.test()?;
|
||||
|
||||
// Test align failed.
|
||||
TestCase::new(
|
||||
StructArray::try_from(vec![
|
||||
("i", Arc::new(Int64Array::from(vec![1])) as ArrayRef),
|
||||
("j", Arc::new(Int64Array::from(vec![2])) as ArrayRef),
|
||||
])
|
||||
.unwrap(),
|
||||
Fields::from(vec![Field::new("i", DataType::Int64, true)]),
|
||||
Err(
|
||||
r#"Failed to align JSON array, reason: this json array has more fields ["j"]"#
|
||||
.to_string(),
|
||||
),
|
||||
)
|
||||
.test()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_record_batch() {
|
||||
let arrow_schema = Arc::new(ArrowSchema::new(vec![
|
||||
|
||||
@@ -306,7 +306,7 @@ pub(crate) fn parse_string_to_value(
|
||||
let v = parse_string_to_jsonb(&s).context(DatatypeSnafu)?;
|
||||
Ok(Value::Binary(v.into()))
|
||||
}
|
||||
JsonFormat::Native(_) => {
|
||||
JsonFormat::Native(_) | JsonFormat::Json2 => {
|
||||
let extension_type: Option<JsonExtensionType> =
|
||||
column_schema.extension_type().context(DatatypeSnafu)?;
|
||||
let json_structure_settings = extension_type
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#![recursion_limit = "256"]
|
||||
|
||||
pub mod alive_keeper;
|
||||
pub mod config;
|
||||
pub mod datanode;
|
||||
|
||||
@@ -274,6 +274,13 @@ pub enum Error {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to align JSON array, reason: {reason}"))]
|
||||
AlignJsonArray {
|
||||
reason: String,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
}
|
||||
|
||||
impl ErrorExt for Error {
|
||||
@@ -316,7 +323,8 @@ impl ErrorExt for Error {
|
||||
| ConvertScalarToArrowArray { .. }
|
||||
| ParseExtendedType { .. }
|
||||
| InconsistentStructFieldsAndItems { .. }
|
||||
| ArrowMetadata { .. } => StatusCode::Internal,
|
||||
| ArrowMetadata { .. }
|
||||
| AlignJsonArray { .. } => StatusCode::Internal,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
//! The struct will carry all the fields of the Json object. We will not flatten any json object in this implementation.
|
||||
//!
|
||||
|
||||
pub mod requirement;
|
||||
pub mod value;
|
||||
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
|
||||
77
src/datatypes/src/json/requirement.rs
Normal file
77
src/datatypes/src/json/requirement.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::data_type::ConcreteDataType;
|
||||
use crate::types::{StructField, StructType};
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
||||
pub struct JsonPathTarget {
|
||||
root: JsonPathTargetNode,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
||||
struct JsonPathTargetNode {
|
||||
children: BTreeMap<String, JsonPathTargetNode>,
|
||||
leaf_type: Option<ConcreteDataType>,
|
||||
}
|
||||
|
||||
impl JsonPathTarget {
|
||||
pub fn require_typed_path(&mut self, path: &str, data_type: ConcreteDataType) {
|
||||
let mut current = &mut self.root;
|
||||
for segment in path.split('.') {
|
||||
current = current.children.entry(segment.to_string()).or_default();
|
||||
}
|
||||
current.require_leaf_type(data_type);
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.root.children.is_empty()
|
||||
}
|
||||
|
||||
pub fn build_type(&self) -> Option<ConcreteDataType> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(ConcreteDataType::Struct(self.root.build_struct_type()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl JsonPathTargetNode {
|
||||
fn require_leaf_type(&mut self, data_type: ConcreteDataType) {
|
||||
self.leaf_type = Some(data_type);
|
||||
}
|
||||
|
||||
fn build_data_type(&self) -> ConcreteDataType {
|
||||
if self.children.is_empty() {
|
||||
self.leaf_type
|
||||
.clone()
|
||||
.unwrap_or_else(ConcreteDataType::string_datatype)
|
||||
} else {
|
||||
ConcreteDataType::Struct(self.build_struct_type())
|
||||
}
|
||||
}
|
||||
|
||||
fn build_struct_type(&self) -> StructType {
|
||||
let fields = self
|
||||
.children
|
||||
.iter()
|
||||
.map(|(name, child)| StructField::new(name.clone(), child.build_data_type(), true))
|
||||
.collect::<Vec<_>>();
|
||||
StructType::new(Arc::new(fields))
|
||||
}
|
||||
}
|
||||
@@ -160,12 +160,18 @@ impl JsonVariant {
|
||||
};
|
||||
JsonNativeType::Array(Box::new(item_type))
|
||||
}
|
||||
JsonVariant::Object(object) => JsonNativeType::Object(
|
||||
object
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), v.native_type()))
|
||||
.collect(),
|
||||
),
|
||||
JsonVariant::Object(object) => {
|
||||
if object.is_empty() {
|
||||
JsonNativeType::Null
|
||||
} else {
|
||||
JsonNativeType::Object(
|
||||
object
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), v.native_type()))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -518,12 +524,18 @@ impl JsonVariantRef<'_> {
|
||||
};
|
||||
JsonNativeType::Array(Box::new(item_type))
|
||||
}
|
||||
JsonVariantRef::Object(object) => JsonNativeType::Object(
|
||||
object
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_string(), native_type(v)))
|
||||
.collect(),
|
||||
),
|
||||
JsonVariantRef::Object(object) => {
|
||||
if object.is_empty() {
|
||||
JsonNativeType::Null
|
||||
} else {
|
||||
JsonNativeType::Object(
|
||||
object
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_string(), native_type(v)))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
JsonType::new_native(native_type(self))
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
mod column_schema;
|
||||
pub mod constraint;
|
||||
pub mod ext;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
25
src/datatypes/src/schema/ext.rs
Normal file
25
src/datatypes/src/schema/ext.rs
Normal file
@@ -0,0 +1,25 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use crate::extension::json;
|
||||
|
||||
/// Extension methods for arrow schemas.
pub trait ArrowSchemaExt {
    /// Returns `true` if at least one field of the schema is a JSON extension field.
    fn has_json_extension_field(&self) -> bool;
}
|
||||
|
||||
impl ArrowSchemaExt for arrow_schema::Schema {
|
||||
fn has_json_extension_field(&self) -> bool {
|
||||
self.fields().iter().any(json::is_json_extension_type)
|
||||
}
|
||||
}
|
||||
@@ -12,12 +12,14 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::{Debug, Display, Formatter};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, LazyLock};
|
||||
|
||||
use arrow::datatypes::DataType as ArrowDataType;
|
||||
use arrow_schema::Fields;
|
||||
use common_base::bytes::Bytes;
|
||||
use regex::{Captures, Regex};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -33,6 +35,7 @@ use crate::type_id::LogicalTypeId;
|
||||
use crate::types::{ListType, StructField, StructType};
|
||||
use crate::value::Value;
|
||||
use crate::vectors::json::builder::JsonVectorBuilder;
|
||||
use crate::vectors::json::builder2::Json2VectorBuilder;
|
||||
use crate::vectors::{BinaryVectorBuilder, MutableVector};
|
||||
|
||||
pub const JSON_TYPE_NAME: &str = "Json";
|
||||
@@ -164,6 +167,7 @@ pub enum JsonFormat {
|
||||
#[default]
|
||||
Jsonb,
|
||||
Native(Box<JsonNativeType>),
|
||||
Json2,
|
||||
}
|
||||
|
||||
/// JsonType is a data type for JSON data. It is stored as binary data of jsonb format.
|
||||
@@ -188,10 +192,15 @@ impl JsonType {
|
||||
matches!(self.format, JsonFormat::Native(_))
|
||||
}
|
||||
|
||||
pub fn is_json2(&self) -> bool {
|
||||
matches!(self.format, JsonFormat::Json2)
|
||||
}
|
||||
|
||||
pub fn native_type(&self) -> &JsonNativeType {
|
||||
match &self.format {
|
||||
JsonFormat::Jsonb => &JsonNativeType::String,
|
||||
JsonFormat::Native(x) => x.as_ref(),
|
||||
JsonFormat::Json2 => unimplemented!(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,15 +221,24 @@ impl JsonType {
|
||||
ConcreteDataType::Struct(t) => t.clone(),
|
||||
x => plain_json_struct_type(x),
|
||||
},
|
||||
JsonFormat::Json2 => unimplemented!(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to merge this json type with others, error on datatype conflict.
|
||||
pub fn merge(&mut self, other: &JsonType) -> Result<()> {
|
||||
self.merge_with(other, false)
|
||||
}
|
||||
|
||||
pub fn merge_with_lifting(&mut self, other: &JsonType) -> Result<()> {
|
||||
self.merge_with(other, true)
|
||||
}
|
||||
|
||||
fn merge_with(&mut self, other: &JsonType, lift: bool) -> Result<()> {
|
||||
match (&self.format, &other.format) {
|
||||
(JsonFormat::Jsonb, JsonFormat::Jsonb) => Ok(()),
|
||||
(JsonFormat::Native(this), JsonFormat::Native(that)) => {
|
||||
let merged = merge(this.as_ref(), that.as_ref())?;
|
||||
let merged = merge(this.as_ref(), that.as_ref(), lift)?;
|
||||
self.format = JsonFormat::Native(Box::new(merged));
|
||||
Ok(())
|
||||
}
|
||||
@@ -313,13 +331,17 @@ fn is_mergeable(this: &JsonNativeType, that: &JsonNativeType) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn merge(this: &JsonNativeType, that: &JsonNativeType) -> Result<JsonNativeType> {
|
||||
fn merge_object(this: &JsonObjectType, that: &JsonObjectType) -> Result<JsonObjectType> {
|
||||
fn merge(this: &JsonNativeType, that: &JsonNativeType, lift: bool) -> Result<JsonNativeType> {
|
||||
fn merge_object(
|
||||
this: &JsonObjectType,
|
||||
that: &JsonObjectType,
|
||||
lift: bool,
|
||||
) -> Result<JsonObjectType> {
|
||||
let mut this = this.clone();
|
||||
// merge "that" into "this" directly:
|
||||
for (type_name, that_type) in that {
|
||||
if let Some(this_type) = this.get_mut(type_name) {
|
||||
let merged_type = merge(this_type, that_type)?;
|
||||
let merged_type = merge(this_type, that_type, lift)?;
|
||||
*this_type = merged_type;
|
||||
} else {
|
||||
this.insert(type_name.clone(), that_type.clone());
|
||||
@@ -331,16 +353,45 @@ fn merge(this: &JsonNativeType, that: &JsonNativeType) -> Result<JsonNativeType>
|
||||
match (this, that) {
|
||||
(this, that) if this == that => Ok(this.clone()),
|
||||
(JsonNativeType::Array(this), JsonNativeType::Array(that)) => {
|
||||
merge(this.as_ref(), that.as_ref()).map(|x| JsonNativeType::Array(Box::new(x)))
|
||||
merge(this.as_ref(), that.as_ref(), lift).map(|x| JsonNativeType::Array(Box::new(x)))
|
||||
}
|
||||
(JsonNativeType::Object(this), JsonNativeType::Object(that)) => {
|
||||
merge_object(this, that).map(JsonNativeType::Object)
|
||||
merge_object(this, that, lift).map(JsonNativeType::Object)
|
||||
}
|
||||
(JsonNativeType::Null, x) | (x, JsonNativeType::Null) => Ok(x.clone()),
|
||||
_ => MergeJsonDatatypeSnafu {
|
||||
reason: format!("datatypes have conflict, this: {this}, that: {that}"),
|
||||
_ => {
|
||||
if lift {
|
||||
Ok(JsonNativeType::String)
|
||||
} else {
|
||||
MergeJsonDatatypeSnafu {
|
||||
reason: format!("datatypes have conflict, this: {this}, that: {that}"),
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
}
|
||||
.fail(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge_as_json_type<'a>(
|
||||
left: &'a ArrowDataType,
|
||||
right: &ArrowDataType,
|
||||
) -> Cow<'a, ArrowDataType> {
|
||||
if left == right {
|
||||
return Cow::Borrowed(left);
|
||||
}
|
||||
|
||||
let mut left = JsonType::from(left);
|
||||
let right = JsonType::from(right);
|
||||
Cow::Owned(if left.merge_with_lifting(&right).is_ok() {
|
||||
left.as_arrow_type()
|
||||
} else {
|
||||
ArrowDataType::Utf8
|
||||
})
|
||||
}
|
||||
|
||||
impl From<&ArrowDataType> for JsonType {
|
||||
fn from(t: &ArrowDataType) -> Self {
|
||||
JsonType::new_native(JsonNativeType::from(&ConcreteDataType::from_arrow_type(t)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -349,6 +400,7 @@ impl DataType for JsonType {
|
||||
match &self.format {
|
||||
JsonFormat::Jsonb => JSON_TYPE_NAME.to_string(),
|
||||
JsonFormat::Native(x) => format!("Json<{x}>"),
|
||||
JsonFormat::Json2 => "JSON2".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -364,6 +416,7 @@ impl DataType for JsonType {
|
||||
match self.format {
|
||||
JsonFormat::Jsonb => ArrowDataType::Binary,
|
||||
JsonFormat::Native(_) => self.as_struct_type().as_arrow_type(),
|
||||
JsonFormat::Json2 => ArrowDataType::Struct(Fields::empty()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -371,6 +424,7 @@ impl DataType for JsonType {
|
||||
match &self.format {
|
||||
JsonFormat::Jsonb => Box::new(BinaryVectorBuilder::with_capacity(capacity)),
|
||||
JsonFormat::Native(x) => Box::new(JsonVectorBuilder::new(*x.clone(), capacity)),
|
||||
JsonFormat::Json2 => Box::new(Json2VectorBuilder::new(JsonNativeType::Null, capacity)),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3206,7 +3206,7 @@ pub(crate) mod tests {
|
||||
]
|
||||
.into(),
|
||||
)),
|
||||
48,
|
||||
56,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ mod duration;
|
||||
mod eq;
|
||||
mod helper;
|
||||
mod interval;
|
||||
pub(crate) mod json;
|
||||
pub mod json;
|
||||
mod list;
|
||||
mod null;
|
||||
pub(crate) mod operations;
|
||||
|
||||
@@ -12,4 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod array;
|
||||
pub(crate) mod builder;
|
||||
pub(crate) mod builder2;
|
||||
|
||||
304
src/datatypes/src/vectors/json/array.rs
Normal file
304
src/datatypes/src/vectors/json/array.rs
Normal file
@@ -0,0 +1,304 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::compute;
|
||||
use arrow::util::display::{ArrayFormatter, FormatOptions};
|
||||
use arrow_array::cast::AsArray;
|
||||
use arrow_array::{Array, ArrayRef, StructArray, new_null_array};
|
||||
use arrow_schema::DataType;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::arrow_array::StringArray;
|
||||
use crate::error::{AlignJsonArraySnafu, ArrowComputeSnafu, Result};
|
||||
|
||||
pub struct JsonArray<'a> {
|
||||
inner: &'a ArrayRef,
|
||||
}
|
||||
|
||||
impl JsonArray<'_> {
|
||||
/// Align a JSON array to the `expect` data type. The `expect` data type is often the
|
||||
/// "largest" JSON type after some insertions in the table schema, while the JSON array previously
|
||||
/// written in the SST could be lagged behind it. So it's important to "align" the JSON array by
|
||||
/// setting the missing fields with null arrays, or casting the data.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - The JSON array is not an Arrow [StructArray], or the provided `expect` data type is not
|
||||
/// of Struct type. Both of which shouldn't happen unless we switch our implementation of how
|
||||
/// JSON array is physically stored.
|
||||
pub fn try_align(&self, expect: &DataType) -> Result<ArrayRef> {
|
||||
let json_type = self.inner.data_type();
|
||||
if json_type == expect {
|
||||
return Ok(self.inner.clone());
|
||||
}
|
||||
|
||||
let struct_array = self.inner.as_struct();
|
||||
let array_fields = struct_array.fields();
|
||||
let array_columns = struct_array.columns();
|
||||
let DataType::Struct(expect_fields) = expect else {
|
||||
unreachable!()
|
||||
};
|
||||
let mut aligned = Vec::with_capacity(expect_fields.len());
|
||||
|
||||
// Compare the fields in the JSON array and the to-be-aligned schema, amending with null arrays
|
||||
// on the way. It's very important to note that fields in the JSON array and those in the JSON type
|
||||
// are both **SORTED**.
|
||||
debug_assert!(expect_fields.iter().map(|f| f.name()).is_sorted());
|
||||
debug_assert!(array_fields.iter().map(|f| f.name()).is_sorted());
|
||||
|
||||
let mut i = 0; // point to the expect fields
|
||||
let mut j = 0; // point to the array fields
|
||||
while i < expect_fields.len() && j < array_fields.len() {
|
||||
let expect_field = &expect_fields[i];
|
||||
let array_field = &array_fields[j];
|
||||
match expect_field.name().cmp(array_field.name()) {
|
||||
Ordering::Equal => {
|
||||
if expect_field.data_type() == array_field.data_type() {
|
||||
aligned.push(array_columns[j].clone());
|
||||
} else {
|
||||
let array = JsonArray::from(&array_columns[j]);
|
||||
if matches!(expect_field.data_type(), DataType::Struct(_)) {
|
||||
// A `StructArray` in a JSON array must be another JSON array.
|
||||
// (Like a nested JSON object in a JSON value.)
|
||||
aligned.push(array.try_align(expect_field.data_type())?);
|
||||
} else {
|
||||
aligned.push(array.try_cast(expect_field.data_type())?);
|
||||
}
|
||||
}
|
||||
i += 1;
|
||||
j += 1;
|
||||
}
|
||||
Ordering::Less => {
|
||||
aligned.push(new_null_array(expect_field.data_type(), struct_array.len()));
|
||||
i += 1;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
j += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if i < expect_fields.len() {
|
||||
for field in &expect_fields[i..] {
|
||||
aligned.push(new_null_array(field.data_type(), struct_array.len()));
|
||||
}
|
||||
}
|
||||
|
||||
let json_array = StructArray::try_new(
|
||||
expect_fields.clone(),
|
||||
aligned,
|
||||
struct_array.nulls().cloned(),
|
||||
)
|
||||
.map_err(|e| {
|
||||
AlignJsonArraySnafu {
|
||||
reason: e.to_string(),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
Ok(Arc::new(json_array))
|
||||
}
|
||||
|
||||
fn try_cast(&self, to_type: &DataType) -> Result<ArrayRef> {
|
||||
if compute::can_cast_types(self.inner.data_type(), to_type) {
|
||||
return compute::cast(&self.inner, to_type).context(ArrowComputeSnafu);
|
||||
}
|
||||
|
||||
let formatter = ArrayFormatter::try_new(&self.inner, &FormatOptions::default())
|
||||
.context(ArrowComputeSnafu)?;
|
||||
let values = (0..self.inner.len())
|
||||
.map(|i| {
|
||||
self.inner
|
||||
.is_valid(i)
|
||||
.then(|| formatter.value(i).to_string())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
Ok(Arc::new(StringArray::from(values)))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a ArrayRef> for JsonArray<'a> {
|
||||
fn from(inner: &'a ArrayRef) -> Self {
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use arrow_array::types::Int64Type;
    use arrow_array::{BooleanArray, Float64Array, Int64Array, ListArray};
    use arrow_schema::{Field, Fields};

    use super::*;

    #[test]
    fn test_align_json_array() -> Result<()> {
        /// One alignment scenario: the input JSON array, the type to align it to, and either
        /// the expected aligned array or the expected error message.
        struct TestCase {
            json_array: ArrayRef,
            schema_type: DataType,
            expected: std::result::Result<ArrayRef, String>,
        }

        impl TestCase {
            /// Builds a test case; on success the `expected` columns are assembled into a
            /// struct array of the `schema_type` fields (without a null buffer).
            fn new(
                json_array: StructArray,
                schema_type: Fields,
                expected: std::result::Result<Vec<ArrayRef>, String>,
            ) -> Self {
                Self {
                    json_array: Arc::new(json_array),
                    schema_type: DataType::Struct(schema_type.clone()),
                    expected: expected
                        .map(|x| Arc::new(StructArray::new(schema_type, x, None)) as ArrayRef),
                }
            }

            /// Runs the alignment and compares the outcome with the expectation.
            fn test(self) -> Result<()> {
                let result = JsonArray::from(&self.json_array).try_align(&self.schema_type);
                match (result, self.expected) {
                    (Ok(json_array), Ok(expected)) => assert_eq!(&json_array, &expected),
                    (Ok(json_array), Err(e)) => {
                        panic!("expecting error {e} but actually get: {json_array:?}")
                    }
                    (Err(e), Err(expected)) => assert_eq!(e.to_string(), expected),
                    (Err(e), Ok(_)) => return Err(e),
                }
                Ok(())
            }
        }

        // Test empty json array can be aligned with a complex json type.
        TestCase::new(
            StructArray::new_empty_fields(2, None),
            Fields::from(vec![
                Field::new("int", DataType::Int64, true),
                Field::new_struct(
                    "nested",
                    vec![Field::new("bool", DataType::Boolean, true)],
                    true,
                ),
                Field::new("string", DataType::Utf8, true),
            ]),
            Ok(vec![
                Arc::new(Int64Array::new_null(2)) as ArrayRef,
                Arc::new(StructArray::new_null(
                    Fields::from(vec![Arc::new(Field::new("bool", DataType::Boolean, true))]),
                    2,
                )),
                Arc::new(StringArray::new_null(2)),
            ]),
        )
        .test()?;

        // Test simple json array alignment.
        TestCase::new(
            StructArray::from(vec![(
                Arc::new(Field::new("float", DataType::Float64, true)),
                Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef,
            )]),
            Fields::from(vec![
                Field::new("float", DataType::Float64, true),
                Field::new("string", DataType::Utf8, true),
            ]),
            Ok(vec![
                Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef,
                Arc::new(StringArray::new_null(3)),
            ]),
        )
        .test()?;

        // Test complex json array alignment.
        TestCase::new(
            StructArray::from(vec![
                (
                    Arc::new(Field::new_list(
                        "list",
                        Field::new_list_field(DataType::Int64, true),
                        true,
                    )),
                    Arc::new(ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
                        Some(vec![Some(1)]),
                        None,
                        Some(vec![Some(2), Some(3)]),
                    ])) as ArrayRef,
                ),
                (
                    Arc::new(Field::new_struct(
                        "nested",
                        vec![Field::new("int", DataType::Int64, true)],
                        true,
                    )),
                    Arc::new(StructArray::from(vec![(
                        Arc::new(Field::new("int", DataType::Int64, true)),
                        Arc::new(Int64Array::from(vec![-1, -2, -3])) as ArrayRef,
                    )])),
                ),
                (
                    Arc::new(Field::new("string", DataType::Utf8, true)),
                    Arc::new(StringArray::from(vec!["a", "b", "c"])),
                ),
            ]),
            Fields::from(vec![
                Field::new("bool", DataType::Boolean, true),
                Field::new_list("list", Field::new_list_field(DataType::Int64, true), true),
                Field::new_struct(
                    "nested",
                    vec![
                        Field::new("float", DataType::Float64, true),
                        Field::new("int", DataType::Int64, true),
                    ],
                    true,
                ),
                Field::new("string", DataType::Utf8, true),
            ]),
            Ok(vec![
                Arc::new(BooleanArray::new_null(3)) as ArrayRef,
                Arc::new(ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
                    Some(vec![Some(1)]),
                    None,
                    Some(vec![Some(2), Some(3)]),
                ])),
                Arc::new(StructArray::from(vec![
                    (
                        Arc::new(Field::new("float", DataType::Float64, true)),
                        Arc::new(Float64Array::new_null(3)) as ArrayRef,
                    ),
                    (
                        Arc::new(Field::new("int", DataType::Int64, true)),
                        Arc::new(Int64Array::from(vec![-1, -2, -3])),
                    ),
                ])),
                Arc::new(StringArray::from(vec!["a", "b", "c"])),
            ]),
        )
        .test()?;

        // Test fields that only exist in the JSON array are dropped: `try_align` skips array
        // fields that are absent from the expected type, it does not report them as errors.
        // NOTE(review): this case used to expect the error
        // `this json array has more fields ["j"]`, which `try_align` never produces; confirm
        // that silently dropping the extra fields is the intended behavior.
        TestCase::new(
            StructArray::try_from(vec![
                ("i", Arc::new(Int64Array::from(vec![1])) as ArrayRef),
                ("j", Arc::new(Int64Array::from(vec![2])) as ArrayRef),
            ])
            .unwrap(),
            Fields::from(vec![Field::new("i", DataType::Int64, true)]),
            Ok(vec![Arc::new(Int64Array::from(vec![1])) as ArrayRef]),
        )
        .test()?;
        Ok(())
    }
}
|
||||
163
src/datatypes/src/vectors/json/builder2.rs
Normal file
163
src/datatypes/src/vectors/json/builder2.rs
Normal file
@@ -0,0 +1,163 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::any::Any;
|
||||
use std::borrow::Cow;
|
||||
use std::sync::LazyLock;
|
||||
|
||||
use crate::data_type::ConcreteDataType;
|
||||
use crate::error::{Result, TryFromValueSnafu, UnsupportedOperationSnafu};
|
||||
use crate::json::value::{JsonValue, JsonValueRef, JsonVariant};
|
||||
use crate::prelude::{ValueRef, Vector, VectorRef};
|
||||
use crate::types::JsonType;
|
||||
use crate::types::json_type::JsonNativeType;
|
||||
use crate::vectors::{MutableVector, StructVectorBuilder};
|
||||
|
||||
pub(crate) struct Json2VectorBuilder {
|
||||
merged_type: JsonType,
|
||||
capacity: usize,
|
||||
values: Vec<JsonValue>,
|
||||
}
|
||||
|
||||
impl Json2VectorBuilder {
|
||||
pub(crate) fn new(json_type: JsonNativeType, capacity: usize) -> Self {
|
||||
Self {
|
||||
merged_type: JsonType::new_native(json_type),
|
||||
capacity,
|
||||
values: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
fn build(&self) -> VectorRef {
|
||||
let mut builder = StructVectorBuilder::with_type_and_capacity(
|
||||
self.merged_type.as_struct_type(),
|
||||
self.capacity,
|
||||
);
|
||||
for value in self.values.iter() {
|
||||
let value = align_json_value_with_type(&self.merged_type, value);
|
||||
builder
|
||||
.try_push_value_ref(&(*value).as_ref().as_value_ref())
|
||||
// Safety: after the `align_json_value_with_type`, the values to push must have
|
||||
// the same types with the builder, so it's not expected to meet any errors here.
|
||||
.unwrap_or_else(|e| panic!("Failed to push JSON value {value}: {e:?}"));
|
||||
}
|
||||
builder.to_vector()
|
||||
}
|
||||
}
|
||||
|
||||
impl MutableVector for Json2VectorBuilder {
|
||||
fn data_type(&self) -> ConcreteDataType {
|
||||
ConcreteDataType::Json(self.merged_type.clone())
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.values.len()
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn as_mut_any(&mut self) -> &mut dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn to_vector(&mut self) -> VectorRef {
|
||||
self.build()
|
||||
}
|
||||
|
||||
fn to_vector_cloned(&self) -> VectorRef {
|
||||
self.build()
|
||||
}
|
||||
|
||||
fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
|
||||
let ValueRef::Json(value) = value else {
|
||||
return TryFromValueSnafu {
|
||||
reason: format!("expected json value, got {value:?}"),
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
let json_type = value.json_type();
|
||||
self.merged_type.merge_with_lifting(json_type)?;
|
||||
|
||||
let value = JsonValue::from(value.clone().into_variant());
|
||||
self.values.push(value);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn push_null(&mut self) {
|
||||
static NULL_JSON: LazyLock<ValueRef> =
|
||||
LazyLock::new(|| ValueRef::Json(Box::new(JsonValueRef::null())));
|
||||
self.try_push_value_ref(&NULL_JSON)
|
||||
// Safety: learning from the method "try_push_value_ref", a null json value should be
|
||||
// always able to push into any json vectors.
|
||||
.unwrap_or_else(|e| panic!("failed to push null json value, error: {e}"));
|
||||
}
|
||||
|
||||
fn extend_slice_of(&mut self, _: &dyn Vector, _: usize, _: usize) -> Result<()> {
|
||||
UnsupportedOperationSnafu {
|
||||
op: "extend_slice_of",
|
||||
vector_type: "JsonVector",
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
}
|
||||
|
||||
fn align_json_value_with_type<'a>(
|
||||
expected_type: &JsonType,
|
||||
value: &'a JsonValue,
|
||||
) -> Cow<'a, JsonValue> {
|
||||
if value.json_type() == expected_type {
|
||||
return Cow::Borrowed(value);
|
||||
}
|
||||
|
||||
fn helper(expected_type: &JsonNativeType, value: JsonVariant) -> JsonVariant {
|
||||
match (expected_type, value) {
|
||||
(_, JsonVariant::Null) | (JsonNativeType::Null, _) => JsonVariant::Null,
|
||||
(JsonNativeType::Bool, JsonVariant::Bool(v)) => JsonVariant::Bool(v),
|
||||
(JsonNativeType::Number(_), JsonVariant::Number(v)) => JsonVariant::Number(v),
|
||||
(JsonNativeType::String, JsonVariant::String(v)) => JsonVariant::String(v),
|
||||
|
||||
(JsonNativeType::Array(item_type), JsonVariant::Array(items)) => JsonVariant::Array(
|
||||
items
|
||||
.into_iter()
|
||||
.map(|item| helper(item_type.as_ref(), item))
|
||||
.collect(),
|
||||
),
|
||||
|
||||
(JsonNativeType::Object(expected_fields), JsonVariant::Object(object)) => {
|
||||
JsonVariant::Object(
|
||||
expected_fields
|
||||
.iter()
|
||||
.map(|(field_name, expected_field_type)| {
|
||||
let value =
|
||||
object.get(field_name).cloned().unwrap_or(JsonVariant::Null);
|
||||
(field_name.clone(), helper(expected_field_type, value))
|
||||
})
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
(JsonNativeType::String, v) => {
|
||||
let json: serde_json::Value = JsonValue::from(v).into();
|
||||
JsonVariant::String(json.to_string())
|
||||
}
|
||||
|
||||
(t, v) => panic!("unsupported json alignment cast from {v} to {t}"),
|
||||
}
|
||||
}
|
||||
|
||||
let value = helper(expected_type.native_type(), value.clone().into_variant());
|
||||
Cow::Owned(JsonValue::from(value))
|
||||
}
|
||||
@@ -16,6 +16,7 @@ workspace = true
|
||||
[dependencies]
|
||||
api.workspace = true
|
||||
aquamarine.workspace = true
|
||||
arrow-schema.workspace = true
|
||||
async-channel = "1.9"
|
||||
common-stat.workspace = true
|
||||
async-stream.workspace = true
|
||||
|
||||
@@ -229,6 +229,7 @@ fn bulk_part_converter(c: &mut Criterion) {
|
||||
&FlatSchemaOptions {
|
||||
raw_pk_columns: false,
|
||||
string_pk_use_dict: false,
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
let mut converter = BulkPartConverter::new(&metadata, schema, rows, codec, false);
|
||||
@@ -255,6 +256,7 @@ fn bulk_part_converter(c: &mut Criterion) {
|
||||
&FlatSchemaOptions {
|
||||
raw_pk_columns: true,
|
||||
string_pk_use_dict: true,
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
let mut converter =
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use arrow_schema::SchemaRef;
|
||||
use async_stream::try_stream;
|
||||
use common_time::Timestamp;
|
||||
use futures::{Stream, TryStreamExt};
|
||||
@@ -403,7 +404,12 @@ impl AccessLayer {
|
||||
}
|
||||
FormatType::Flat => {
|
||||
writer
|
||||
.write_all_flat(request.source, request.max_sequence, write_opts)
|
||||
.write_all_flat(
|
||||
request.source,
|
||||
request.schema,
|
||||
request.max_sequence,
|
||||
write_opts,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
}
|
||||
@@ -540,6 +546,8 @@ pub struct SstWriteRequest {
|
||||
pub bloom_filter_index_config: BloomFilterConfig,
|
||||
#[cfg(feature = "vector_index")]
|
||||
pub vector_index_config: crate::config::VectorIndexConfig,
|
||||
/// The Arrow schema of the `RecordBatch`s in the [FlatSource].
|
||||
pub schema: Option<SchemaRef>,
|
||||
}
|
||||
|
||||
/// Cleaner to remove temp files on the atomic write dir.
|
||||
|
||||
10
src/mito2/src/cache/write_cache.rs
vendored
10
src/mito2/src/cache/write_cache.rs
vendored
@@ -256,7 +256,12 @@ impl WriteCache {
|
||||
}
|
||||
crate::sst::FormatType::Flat => {
|
||||
writer
|
||||
.write_all_flat(write_request.source, write_request.max_sequence, write_opts)
|
||||
.write_all_flat(
|
||||
write_request.source,
|
||||
write_request.schema,
|
||||
write_request.max_sequence,
|
||||
write_opts,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
@@ -561,6 +566,7 @@ mod tests {
|
||||
bloom_filter_index_config: Default::default(),
|
||||
#[cfg(feature = "vector_index")]
|
||||
vector_index_config: Default::default(),
|
||||
schema: None,
|
||||
};
|
||||
|
||||
let upload_request = SstUploadRequest {
|
||||
@@ -664,6 +670,7 @@ mod tests {
|
||||
bloom_filter_index_config: Default::default(),
|
||||
#[cfg(feature = "vector_index")]
|
||||
vector_index_config: Default::default(),
|
||||
schema: None,
|
||||
};
|
||||
let write_opts = WriteOptions {
|
||||
row_group_size: 512,
|
||||
@@ -755,6 +762,7 @@ mod tests {
|
||||
bloom_filter_index_config: Default::default(),
|
||||
#[cfg(feature = "vector_index")]
|
||||
vector_index_config: Default::default(),
|
||||
schema: None,
|
||||
};
|
||||
let write_opts = WriteOptions {
|
||||
row_group_size: 512,
|
||||
|
||||
@@ -29,6 +29,7 @@ use std::time::Instant;
|
||||
|
||||
use api::v1::region::compact_request;
|
||||
use api::v1::region::compact_request::Options;
|
||||
use arrow_schema::SchemaRef;
|
||||
use common_base::Plugins;
|
||||
use common_memory_manager::OnExhaustedPolicy;
|
||||
use common_meta::key::SchemaMetadataManagerRef;
|
||||
@@ -60,7 +61,7 @@ use crate::error::{
|
||||
use crate::metrics::{COMPACTION_STAGE_ELAPSED, INFLIGHT_COMPACTION_COUNT};
|
||||
use crate::read::BoxedRecordBatchStream;
|
||||
use crate::read::projection::ProjectionMapper;
|
||||
use crate::read::scan_region::{PredicateGroup, ScanInput};
|
||||
use crate::read::scan_region::{PredicateGroup, ScanInput, concretize_json2_types};
|
||||
use crate::read::seq_scan::SeqScan;
|
||||
use crate::region::options::{MergeMode, RegionOptions};
|
||||
use crate::region::version::VersionControlRef;
|
||||
@@ -839,15 +840,17 @@ struct CompactionSstReaderBuilder<'a> {
|
||||
|
||||
impl CompactionSstReaderBuilder<'_> {
|
||||
/// Builds [BoxedRecordBatchStream] that reads all SST files and yields batches in flat format for compaction.
|
||||
async fn build_flat_sst_reader(self) -> Result<BoxedRecordBatchStream> {
|
||||
let scan_input = self.build_scan_input()?.with_compaction(true);
|
||||
async fn build_flat_sst_reader(self) -> Result<(SchemaRef, BoxedRecordBatchStream)> {
|
||||
let scan_input = self.build_scan_input().await?.with_compaction(true);
|
||||
|
||||
SeqScan::new(scan_input)
|
||||
let schema = scan_input.mapper.output_schema().arrow_schema().clone();
|
||||
let reader = SeqScan::new(scan_input)
|
||||
.build_flat_reader_for_compaction()
|
||||
.await
|
||||
.await?;
|
||||
Ok((schema, reader))
|
||||
}
|
||||
|
||||
fn build_scan_input(self) -> Result<ScanInput> {
|
||||
async fn build_scan_input(self) -> Result<ScanInput> {
|
||||
let mapper = ProjectionMapper::all(&self.metadata, true)?;
|
||||
let mut scan_input = ScanInput::new(self.sst_layer, mapper)
|
||||
.with_files(self.inputs.to_vec())
|
||||
@@ -867,7 +870,7 @@ impl CompactionSstReaderBuilder<'_> {
|
||||
scan_input.with_predicate(time_range_to_predicate(time_range, &self.metadata)?);
|
||||
}
|
||||
|
||||
Ok(scan_input)
|
||||
concretize_json2_types(scan_input).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -356,7 +356,7 @@ impl DefaultCompactor {
|
||||
time_range: output.output_time_range,
|
||||
merge_mode,
|
||||
};
|
||||
let reader = builder.build_flat_sst_reader().await?;
|
||||
let (schema, reader) = builder.build_flat_sst_reader().await?;
|
||||
let source = FlatSource::Stream(reader);
|
||||
let mut metrics = Metrics::new(WriteType::Compaction);
|
||||
let region_metadata = compaction_region.region_metadata.clone();
|
||||
@@ -367,6 +367,7 @@ impl DefaultCompactor {
|
||||
op_type: OperationType::Compact,
|
||||
metadata: region_metadata.clone(),
|
||||
source,
|
||||
schema: Some(schema),
|
||||
cache_manager: compaction_region.cache_manager.clone(),
|
||||
storage,
|
||||
max_sequence: max_sequence.map(NonZero::get),
|
||||
|
||||
@@ -543,14 +543,14 @@ impl RegionFlushTask {
|
||||
write_opts: &WriteOptions,
|
||||
mem_ranges: MemtableRanges,
|
||||
) -> Result<FlushFlatMemResult> {
|
||||
let batch_schema = to_flat_sst_arrow_schema(
|
||||
&version.metadata,
|
||||
&FlatSchemaOptions::from_encoding(version.metadata.primary_key_encoding),
|
||||
);
|
||||
let mut options = FlatSchemaOptions::from_encoding(version.metadata.primary_key_encoding);
|
||||
options.override_schema = mem_ranges.schema();
|
||||
|
||||
let batch_schema = to_flat_sst_arrow_schema(&version.metadata, &options);
|
||||
let field_column_start =
|
||||
flat_format::field_column_start(&version.metadata, batch_schema.fields().len());
|
||||
let flat_sources = memtable_flat_sources(
|
||||
batch_schema,
|
||||
batch_schema.clone(),
|
||||
mem_ranges,
|
||||
&version.options,
|
||||
field_column_start,
|
||||
@@ -558,7 +558,8 @@ impl RegionFlushTask {
|
||||
let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len());
|
||||
let num_encoded = flat_sources.encoded.len();
|
||||
for (source, max_sequence) in flat_sources.sources {
|
||||
let write_request = self.new_write_request(version, max_sequence, source);
|
||||
let write_request =
|
||||
self.new_write_request(version, max_sequence, source, batch_schema.clone());
|
||||
let access_layer = self.access_layer.clone();
|
||||
let write_opts = write_opts.clone();
|
||||
let semaphore = self.flush_semaphore.clone();
|
||||
@@ -629,6 +630,7 @@ impl RegionFlushTask {
|
||||
version: &VersionRef,
|
||||
max_sequence: u64,
|
||||
source: FlatSource,
|
||||
schema: SchemaRef,
|
||||
) -> SstWriteRequest {
|
||||
let flat_format = version
|
||||
.options
|
||||
@@ -639,6 +641,7 @@ impl RegionFlushTask {
|
||||
op_type: OperationType::Flush,
|
||||
metadata: version.metadata.clone(),
|
||||
source,
|
||||
schema: Some(schema),
|
||||
cache_manager: self.cache_manager.clone(),
|
||||
storage: version.options.storage.clone(),
|
||||
max_sequence: Some(max_sequence),
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
//! Memtables are write buffers for regions.
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
@@ -62,6 +63,10 @@ pub use bulk::part::{
|
||||
BulkPart, BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
|
||||
sort_primary_key_record_batch,
|
||||
};
|
||||
use datatypes::arrow::datatypes::{Schema, SchemaRef};
|
||||
use datatypes::extension::json;
|
||||
use datatypes::schema::ext::ArrowSchemaExt;
|
||||
use datatypes::types::json_type;
|
||||
#[cfg(any(test, feature = "test"))]
|
||||
pub use time_partition::filter_record_batch;
|
||||
|
||||
@@ -228,6 +233,55 @@ impl MemtableRanges {
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
pub(crate) fn schema(&self) -> Option<SchemaRef> {
|
||||
let mut schemas = self
|
||||
.ranges
|
||||
.values()
|
||||
.filter_map(|x| x.record_batch_schema())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if schemas.iter().all(|x| !x.has_json_extension_field()) {
|
||||
// If there are no JSON extension fields in any schemas, the invariant must be hold,
|
||||
// that all schemas are same (they are all derived from same region metadata).
|
||||
// So it's ok to return the first one as the schema of the whole memtable ranges.
|
||||
return (!schemas.is_empty()).then(|| schemas.swap_remove(0));
|
||||
}
|
||||
|
||||
// If there are JSON extension fields, by convention, only their concrete data types
|
||||
// (Arrow's Struct) may differ. Other things like the metadata or the fields count are same.
|
||||
// So to produce the final schema, we can solely merge the data types.
|
||||
schemas
|
||||
.split_first()
|
||||
.map(|(first, rest)| merge_json_extension_fields(first, rest))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn merge_json_extension_fields(base: &SchemaRef, others: &[SchemaRef]) -> SchemaRef {
|
||||
let mut fields = base.fields().iter().cloned().collect::<Vec<_>>();
|
||||
for (i, field) in fields.iter_mut().enumerate() {
|
||||
if !json::is_json_extension_type(field) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let merged = others
|
||||
.iter()
|
||||
.map(|x| Cow::Borrowed(x.field(i).data_type()))
|
||||
.reduce(|acc, e| {
|
||||
Cow::Owned(json_type::merge_as_json_type(acc.as_ref(), e.as_ref()).into_owned())
|
||||
});
|
||||
if let Some(merged) = merged
|
||||
&& field.data_type() != merged.as_ref()
|
||||
{
|
||||
let merged =
|
||||
json_type::merge_as_json_type(field.data_type(), merged.as_ref()).into_owned();
|
||||
|
||||
let mut new = field.as_ref().clone();
|
||||
new.set_data_type(merged);
|
||||
*field = Arc::new(new);
|
||||
}
|
||||
}
|
||||
Arc::new(Schema::new_with_metadata(fields, base.metadata().clone()))
|
||||
}
|
||||
|
||||
impl IterBuilder for MemtableRanges {
|
||||
@@ -552,6 +606,11 @@ pub trait IterBuilder: Send + Sync {
|
||||
.fail()
|
||||
}
|
||||
|
||||
/// Returns the schema of record batches produced by this iterator.
///
/// The default implementation returns `None`; implementors that can report a
/// record batch schema override this method.
|
||||
fn record_batch_schema(&self) -> Option<SchemaRef> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the [EncodedRange] if the range is already encoded into SST.
|
||||
fn encoded_range(&self) -> Option<EncodedRange> {
|
||||
None
|
||||
@@ -729,6 +788,11 @@ impl MemtableRange {
|
||||
self.context.builder.is_record_batch()
|
||||
}
|
||||
|
||||
/// Returns the schema of record batches if this range supports record batch iteration.
///
/// Delegates to the builder stored in this range's context; `None` is returned when
/// that builder does not report a schema.
|
||||
pub fn record_batch_schema(&self) -> Option<SchemaRef> {
|
||||
self.context.builder.record_batch_schema()
|
||||
}
|
||||
|
||||
pub fn num_rows(&self) -> usize {
|
||||
self.stats.num_rows
|
||||
}
|
||||
|
||||
@@ -812,6 +812,10 @@ impl IterBuilder for BulkRangeIterBuilder {
|
||||
fn encoded_range(&self) -> Option<EncodedRange> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the schema of the record batch held by this builder's part.
/// Always `Some`, since the schema is read directly from the batch.
fn record_batch_schema(&self) -> Option<SchemaRef> {
|
||||
Some(self.part.batch.schema())
|
||||
}
|
||||
}
|
||||
|
||||
impl IterBuilder for MultiBulkRangeIterBuilder {
|
||||
@@ -844,6 +848,10 @@ impl IterBuilder for MultiBulkRangeIterBuilder {
|
||||
fn encoded_range(&self) -> Option<EncodedRange> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the record batch schema reported by this builder's part, if it has one.
fn record_batch_schema(&self) -> Option<SchemaRef> {
|
||||
self.part.record_batch_schema()
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator builder for encoded bulk range
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
//! Bulk part encoder/decoder.
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -23,7 +24,7 @@ use api::v1::bulk_wal_entry::Body;
|
||||
use api::v1::{ArrowIpc, BulkWalEntry, Mutation, OpType, bulk_wal_entry};
|
||||
use bytes::Bytes;
|
||||
use common_grpc::flight::{FlightDecoder, FlightEncoder, FlightMessage};
|
||||
use common_recordbatch::DfRecordBatch as RecordBatch;
|
||||
use common_recordbatch::{DfRecordBatch as RecordBatch, recordbatch};
|
||||
use common_time::Timestamp;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use datatypes::arrow;
|
||||
@@ -39,9 +40,13 @@ use datatypes::arrow::datatypes::{
|
||||
};
|
||||
use datatypes::arrow_array::BinaryArray;
|
||||
use datatypes::data_type::DataType;
|
||||
use datatypes::extension::json::is_json_extension_type;
|
||||
use datatypes::prelude::{MutableVector, ScalarVectorBuilder, Vector};
|
||||
use datatypes::schema::ext::ArrowSchemaExt;
|
||||
use datatypes::types::json_type;
|
||||
use datatypes::value::{Value, ValueRef};
|
||||
use datatypes::vectors::Helper;
|
||||
use datatypes::vectors::json::array::JsonArray;
|
||||
use mito_codec::key_values::{KeyValue, KeyValues, KeyValuesRef};
|
||||
use mito_codec::row_converter::{
|
||||
DensePrimaryKeyCodec, PrimaryKeyCodec, PrimaryKeyCodecExt, build_primary_key_codec,
|
||||
@@ -60,9 +65,10 @@ use store_api::storage::{FileId, RegionId, SequenceNumber, SequenceRange};
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::error::{
|
||||
self, ColumnNotFoundSnafu, ComputeArrowSnafu, ConvertColumnDataTypeSnafu, CreateDefaultSnafu,
|
||||
DataTypeMismatchSnafu, EncodeMemtableSnafu, EncodeSnafu, InvalidMetadataSnafu,
|
||||
InvalidRequestSnafu, NewRecordBatchSnafu, Result, UnexpectedSnafu,
|
||||
self, ColumnNotFoundSnafu, ComputeArrowSnafu, ConvertColumnDataTypeSnafu, ConvertValueSnafu,
|
||||
CreateDefaultSnafu, DataTypeMismatchSnafu, EncodeMemtableSnafu, EncodeSnafu,
|
||||
InvalidMetadataSnafu, InvalidRequestSnafu, NewRecordBatchSnafu, RecordBatchSnafu, Result,
|
||||
UnexpectedSnafu,
|
||||
};
|
||||
use crate::memtable::bulk::context::BulkIterContextRef;
|
||||
use crate::memtable::bulk::part_reader::EncodedBulkPartIter;
|
||||
@@ -436,13 +442,15 @@ impl UnorderedPart {
|
||||
return Ok(Some(self.parts[0].batch.clone()));
|
||||
}
|
||||
|
||||
// Get the schema from the first part
|
||||
// Get the schema from the first part and normalize JSON2 columns across all parts.
|
||||
let schema = self.parts[0].batch.schema();
|
||||
|
||||
// Concatenate all record batches
|
||||
let batches: Vec<RecordBatch> = self.parts.iter().map(|p| p.batch.clone()).collect();
|
||||
let concatenated =
|
||||
arrow::compute::concat_batches(&schema, &batches).context(ComputeArrowSnafu)?;
|
||||
let concatenated = if schema.has_json_extension_field() {
|
||||
let (schema, batches) = normalize_json_columns_for_concat(schema, &self.parts)?;
|
||||
arrow::compute::concat_batches(&schema, &batches).context(ComputeArrowSnafu)?
|
||||
} else {
|
||||
arrow::compute::concat_batches(&schema, self.parts.iter().map(|x| &x.batch))
|
||||
.context(ComputeArrowSnafu)?
|
||||
};
|
||||
|
||||
// Sort the concatenated batch
|
||||
let sorted_batch = sort_primary_key_record_batch(&concatenated)?;
|
||||
@@ -477,6 +485,81 @@ impl UnorderedPart {
|
||||
self.max_timestamp = i64::MIN;
|
||||
self.max_sequence = 0;
|
||||
}
|
||||
|
||||
/// Returns the bulk parts held by this struct as a borrowed slice.
pub(crate) fn parts(&self) -> &[BulkPart] {
|
||||
&self.parts
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_json_columns_for_concat(
|
||||
base_schema: SchemaRef,
|
||||
parts: &[BulkPart],
|
||||
) -> Result<(SchemaRef, Vec<RecordBatch>)> {
|
||||
debug_assert!(
|
||||
parts
|
||||
.iter()
|
||||
.all(|x| x.batch.schema().fields().len() == base_schema.fields().len())
|
||||
);
|
||||
|
||||
let mut merged_json_types = HashMap::new();
|
||||
for (index, field) in base_schema.fields().iter().enumerate() {
|
||||
if !is_json_extension_type(field) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let merged = parts
|
||||
.iter()
|
||||
.map(|x| Cow::Borrowed(x.batch.schema_ref().field(index).data_type()))
|
||||
.reduce(|acc, e| {
|
||||
Cow::Owned(json_type::merge_as_json_type(acc.as_ref(), e.as_ref()).into_owned())
|
||||
});
|
||||
if let Some(merged) = merged {
|
||||
merged_json_types.insert(index, merged.into_owned());
|
||||
}
|
||||
}
|
||||
|
||||
if merged_json_types.is_empty() {
|
||||
let batches = parts.iter().map(|p| p.batch.clone()).collect();
|
||||
return Ok((base_schema, batches));
|
||||
}
|
||||
|
||||
let fields = base_schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(index, field)| {
|
||||
if let Some(data_type) = merged_json_types.get(&index) {
|
||||
Arc::new(
|
||||
Field::new(field.name().clone(), data_type.clone(), field.is_nullable())
|
||||
.with_metadata(field.metadata().clone()),
|
||||
)
|
||||
} else {
|
||||
field.clone()
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let normalized_schema = Arc::new(Schema::new(fields));
|
||||
|
||||
let mut normalized_batches = Vec::with_capacity(parts.len());
|
||||
for part in parts {
|
||||
let mut columns = Vec::with_capacity(part.batch.num_columns());
|
||||
for (index, column) in part.batch.columns().iter().enumerate() {
|
||||
if let Some(target_type) = merged_json_types.get(&index) {
|
||||
columns.push(
|
||||
JsonArray::from(column)
|
||||
.try_align(target_type)
|
||||
.context(ConvertValueSnafu)?,
|
||||
);
|
||||
} else {
|
||||
columns.push(column.clone());
|
||||
}
|
||||
}
|
||||
let batch = RecordBatch::try_new(normalized_schema.clone(), columns)
|
||||
.context(NewRecordBatchSnafu)?;
|
||||
normalized_batches.push(batch);
|
||||
}
|
||||
|
||||
Ok((normalized_schema, normalized_batches))
|
||||
}
|
||||
|
||||
/// More accurate estimation of the size of a record batch.
|
||||
@@ -693,7 +776,8 @@ impl BulkPartConverter {
|
||||
columns.push(values.sequence.to_arrow_array());
|
||||
columns.push(values.op_type.to_arrow_array());
|
||||
|
||||
let batch = RecordBatch::try_new(self.schema, columns).context(NewRecordBatchSnafu)?;
|
||||
let schema = align_schema_with_json_array(self.schema, &columns);
|
||||
let batch = RecordBatch::try_new(schema, columns).context(NewRecordBatchSnafu)?;
|
||||
// Sorts the record batch.
|
||||
let batch = sort_primary_key_record_batch(&batch)?;
|
||||
|
||||
@@ -708,6 +792,26 @@ impl BulkPartConverter {
|
||||
}
|
||||
}
|
||||
|
||||
fn align_schema_with_json_array(schema: SchemaRef, columns: &[ArrayRef]) -> SchemaRef {
|
||||
if schema.fields().iter().all(|f| !is_json_extension_type(f)) {
|
||||
return schema;
|
||||
}
|
||||
|
||||
let mut fields = Vec::with_capacity(schema.fields().len());
|
||||
for (field, array) in schema.fields().iter().zip(columns) {
|
||||
if !is_json_extension_type(field) {
|
||||
fields.push(field.clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut field = field.as_ref().clone();
|
||||
field.set_data_type(array.data_type().clone());
|
||||
fields.push(Arc::new(field));
|
||||
}
|
||||
|
||||
Arc::new(Schema::new_with_metadata(fields, schema.metadata().clone()))
|
||||
}
|
||||
|
||||
fn new_primary_key_column_builders(
|
||||
metadata: &RegionMetadata,
|
||||
capacity: usize,
|
||||
@@ -1343,6 +1447,11 @@ impl MultiBulkPart {
|
||||
self.series_count
|
||||
}
|
||||
|
||||
/// Returns the schema of batches in this part.
///
/// The schema is taken from the first batch; `None` is returned when the part
/// holds no batches.
|
||||
pub(crate) fn record_batch_schema(&self) -> Option<SchemaRef> {
|
||||
self.batches.first().map(|batch| batch.schema())
|
||||
}
|
||||
|
||||
/// Returns the number of record batches in this part.
|
||||
pub fn num_batches(&self) -> usize {
|
||||
self.batches.len()
|
||||
@@ -1822,6 +1931,7 @@ mod tests {
|
||||
&FlatSchemaOptions {
|
||||
raw_pk_columns: false,
|
||||
string_pk_use_dict: true,
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
|
||||
@@ -2259,6 +2369,7 @@ mod tests {
|
||||
&FlatSchemaOptions {
|
||||
raw_pk_columns: false,
|
||||
string_pk_use_dict: true,
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
|
||||
|
||||
@@ -892,7 +892,9 @@ impl ValueBuilder {
|
||||
size += field_value.data_size();
|
||||
if !field_value.is_null() || self.fields[idx].is_some() {
|
||||
if let Some(field) = self.fields[idx].as_mut() {
|
||||
let _ = field.push(field_value);
|
||||
field
|
||||
.push(field_value)
|
||||
.unwrap_or_else(|e| panic!("Failed to push field value: {e:?}"));
|
||||
} else {
|
||||
let mut mutable_vector =
|
||||
if let ConcreteDataType::String(_) = &self.field_types[idx] {
|
||||
|
||||
@@ -18,7 +18,6 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::SemanticType;
|
||||
use common_recordbatch::recordbatch::align_json_array;
|
||||
use datatypes::arrow::array::{
|
||||
Array, ArrayRef, BinaryArray, BinaryBuilder, DictionaryArray, UInt32Array,
|
||||
};
|
||||
@@ -28,6 +27,7 @@ use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use datatypes::prelude::DataType;
|
||||
use datatypes::value::Value;
|
||||
use datatypes::vectors::json::array::JsonArray;
|
||||
use datatypes::vectors::{Helper, VectorRef};
|
||||
use mito_codec::row_converter::{
|
||||
CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec,
|
||||
@@ -39,8 +39,8 @@ use store_api::metadata::{RegionMetadata, RegionMetadataRef};
|
||||
use store_api::storage::ColumnId;
|
||||
|
||||
use crate::error::{
|
||||
CastVectorSnafu, CompatReaderSnafu, ComputeArrowSnafu, ConvertVectorSnafu, CreateDefaultSnafu,
|
||||
DecodeSnafu, EncodeSnafu, NewRecordBatchSnafu, RecordBatchSnafu, Result, UnexpectedSnafu,
|
||||
CastVectorSnafu, CompatReaderSnafu, ComputeArrowSnafu, ConvertValueSnafu, ConvertVectorSnafu,
|
||||
CreateDefaultSnafu, DecodeSnafu, EncodeSnafu, NewRecordBatchSnafu, Result, UnexpectedSnafu,
|
||||
UnsupportedOperationSnafu,
|
||||
};
|
||||
use crate::read::flat_projection::{FlatProjectionMapper, flat_projected_columns};
|
||||
@@ -354,8 +354,9 @@ impl FlatCompatBatch {
|
||||
|
||||
if let Some(ty) = cast_type {
|
||||
let casted = if let Some(json_type) = ty.as_json() {
|
||||
align_json_array(old_column, &json_type.as_arrow_type())
|
||||
.context(RecordBatchSnafu)?
|
||||
JsonArray::from(old_column)
|
||||
.try_align(&json_type.as_arrow_type())
|
||||
.context(ConvertValueSnafu)?
|
||||
} else {
|
||||
datatypes::arrow::compute::cast(old_column, &ty.as_arrow_type())
|
||||
.context(ComputeArrowSnafu)?
|
||||
@@ -474,10 +475,9 @@ impl CompatFields {
|
||||
|
||||
let data = if let Some(ty) = cast_type {
|
||||
if let Some(json_type) = ty.as_json() {
|
||||
let json_array = old_column.data.to_arrow_array();
|
||||
let json_array =
|
||||
align_json_array(&json_array, &json_type.as_arrow_type())
|
||||
.context(RecordBatchSnafu)?;
|
||||
let json_array = JsonArray::from(&old_column.data.to_arrow_array())
|
||||
.try_align(&json_type.as_arrow_type())
|
||||
.context(ConvertValueSnafu)?;
|
||||
Helper::try_into_vector(&json_array).context(ConvertVectorSnafu)?
|
||||
} else {
|
||||
old_column.data.cast(ty).with_context(|_| CastVectorSnafu {
|
||||
|
||||
@@ -17,15 +17,20 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::SemanticType;
|
||||
use arrow_schema::extension::ExtensionType;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu};
|
||||
use common_recordbatch::error::{
|
||||
ArrowComputeSnafu, DataTypesSnafu, ExternalSnafu, NewDfRecordBatchSnafu,
|
||||
};
|
||||
use common_recordbatch::{DfRecordBatch, RecordBatch};
|
||||
use datatypes::arrow::array::Array;
|
||||
use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
|
||||
use datatypes::extension::json::JsonExtensionType;
|
||||
use datatypes::prelude::{ConcreteDataType, DataType};
|
||||
use datatypes::schema::{Schema, SchemaRef};
|
||||
use datatypes::value::Value;
|
||||
use datatypes::vectors::Helper;
|
||||
use datatypes::vectors::json::array::JsonArray;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::metadata::{RegionMetadata, RegionMetadataRef};
|
||||
use store_api::storage::ColumnId;
|
||||
@@ -43,6 +48,7 @@ use crate::sst::{
|
||||
///
|
||||
/// This mapper support duplicate and unsorted projection indices.
|
||||
/// The output schema is determined by the projection indices.
|
||||
#[derive(Clone)]
|
||||
pub struct FlatProjectionMapper {
|
||||
/// Metadata of the region.
|
||||
metadata: RegionMetadataRef,
|
||||
@@ -240,6 +246,10 @@ impl FlatProjectionMapper {
|
||||
self.output_schema.clone()
|
||||
}
|
||||
|
||||
/// Replaces the output schema stored in this mapper with `output_schema`.
///
/// Only the stored schema is swapped; no other field of the mapper is touched here.
pub(crate) fn with_output_schema(&mut self, output_schema: SchemaRef) {
|
||||
self.output_schema = output_schema;
|
||||
}
|
||||
|
||||
/// Returns an empty [RecordBatch].
|
||||
pub(crate) fn empty_record_batch(&self) -> RecordBatch {
|
||||
RecordBatch::new_empty(self.output_schema.clone())
|
||||
@@ -290,6 +300,13 @@ impl FlatProjectionMapper {
|
||||
array = casted;
|
||||
}
|
||||
}
|
||||
|
||||
let field = self.output_schema.arrow_schema().field(output_idx);
|
||||
if field.extension_type_name() == Some(JsonExtensionType::NAME) {
|
||||
array = JsonArray::from(&array)
|
||||
.try_align(field.data_type())
|
||||
.context(DataTypesSnafu)?;
|
||||
}
|
||||
arrays.push(array);
|
||||
}
|
||||
|
||||
|
||||
@@ -40,6 +40,7 @@ use crate::read::flat_projection::FlatProjectionMapper;
|
||||
pub(crate) const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
|
||||
|
||||
/// Wrapper enum for different projection mapper implementations.
|
||||
#[derive(Clone)]
|
||||
pub enum ProjectionMapper {
|
||||
/// Projection mapper for primary key format.
|
||||
PrimaryKey(PrimaryKeyProjectionMapper),
|
||||
@@ -148,6 +149,12 @@ impl ProjectionMapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// Overrides the output schema when this mapper is the `Flat` variant.
///
/// For any other variant the call is a no-op and `output_schema` is dropped.
pub(crate) fn with_flat_output_schema(&mut self, output_schema: SchemaRef) {
|
||||
if let ProjectionMapper::Flat(m) = self {
|
||||
m.with_output_schema(output_schema)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty [RecordBatch].
|
||||
// TODO(yingwen): This is unused now. Use it after we finishing the flat format.
|
||||
pub fn empty_record_batch(&self) -> RecordBatch {
|
||||
@@ -159,6 +166,7 @@ impl ProjectionMapper {
|
||||
}
|
||||
|
||||
/// Handles projection and converts a projected [Batch] to a projected [RecordBatch].
|
||||
#[derive(Clone)]
|
||||
pub struct PrimaryKeyProjectionMapper {
|
||||
/// Metadata of the region.
|
||||
metadata: RegionMetadataRef,
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! Structs for partition ranges.
|
||||
|
||||
use common_time::Timestamp;
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use smallvec::{SmallVec, smallvec};
|
||||
use store_api::region_engine::PartitionRange;
|
||||
use store_api::storage::TimeSeriesDistribution;
|
||||
@@ -478,6 +479,11 @@ impl MemRangeBuilder {
|
||||
pub(crate) fn stats(&self) -> &MemtableStats {
|
||||
&self.stats
|
||||
}
|
||||
|
||||
/// Returns the record batch schema for this memtable range if available.
///
/// Delegates to the wrapped range.
|
||||
pub(crate) fn record_batch_schema(&self) -> Option<SchemaRef> {
|
||||
self.range.record_batch_schema()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
//! Scans a region according to the scan request.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt;
|
||||
use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
@@ -27,11 +27,19 @@ use common_recordbatch::filter::SimpleFilterEvaluator;
|
||||
use common_telemetry::tracing::Instrument;
|
||||
use common_telemetry::{debug, error, tracing, warn};
|
||||
use common_time::range::TimestampRange;
|
||||
use datafusion::parquet::arrow::parquet_to_arrow_schema;
|
||||
use datafusion::physical_plan::expressions::DynamicFilterPhysicalExpr;
|
||||
use datafusion_common::Column;
|
||||
use datafusion_expr::Expr;
|
||||
use datafusion_expr::utils::expr_to_columns;
|
||||
use datatypes::arrow::datatypes::DataType as ArrowDataType;
|
||||
use datatypes::data_type::{ConcreteDataType, DataType};
|
||||
use datatypes::extension::json::is_json_extension_type;
|
||||
use datatypes::schema::Schema;
|
||||
use datatypes::schema::ext::ArrowSchemaExt;
|
||||
use datatypes::types::json_type;
|
||||
use futures::StreamExt;
|
||||
use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
|
||||
use partition::expr::PartitionExpr;
|
||||
use smallvec::SmallVec;
|
||||
use snafu::{OptionExt as _, ResultExt};
|
||||
@@ -45,9 +53,9 @@ use tokio::sync::{Semaphore, mpsc};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
use crate::cache::CacheStrategy;
|
||||
use crate::cache::{CacheStrategy, CachedSstMeta};
|
||||
use crate::config::{DEFAULT_MAX_CONCURRENT_SCAN_FILES, DEFAULT_SCAN_CHANNEL_SIZE};
|
||||
use crate::error::{InvalidPartitionExprSnafu, InvalidRequestSnafu, Result};
|
||||
use crate::error::{InvalidMetaSnafu, InvalidPartitionExprSnafu, InvalidRequestSnafu, Result};
|
||||
#[cfg(feature = "enterprise")]
|
||||
use crate::extension::{BoxedExtensionRange, BoxedExtensionRangeProvider};
|
||||
use crate::memtable::{MemtableRange, RangesOptions};
|
||||
@@ -75,7 +83,8 @@ use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBui
|
||||
#[cfg(feature = "vector_index")]
|
||||
use crate::sst::index::vector_index::applier::{VectorIndexApplier, VectorIndexApplierRef};
|
||||
use crate::sst::parquet::file_range::PreFilterMode;
|
||||
use crate::sst::parquet::reader::ReaderMetrics;
|
||||
use crate::sst::parquet::metadata::MetadataLoader;
|
||||
use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderMetrics};
|
||||
|
||||
/// Parallel scan channel size for flat format.
|
||||
const FLAT_SCAN_CHANNEL_SIZE: usize = 2;
|
||||
@@ -552,6 +561,7 @@ impl ScanRegion {
|
||||
.with_merge_mode(self.version.options.merge_mode())
|
||||
.with_series_row_selector(self.request.series_row_selector)
|
||||
.with_distribution(self.request.distribution)
|
||||
.with_json2_column_types(self.request.json2_column_types.clone())
|
||||
.with_flat_format(flat_format);
|
||||
#[cfg(feature = "vector_index")]
|
||||
let input = input
|
||||
@@ -568,6 +578,8 @@ impl ScanRegion {
|
||||
} else {
|
||||
input
|
||||
};
|
||||
|
||||
let input = concretize_json2_types(input).await?;
|
||||
Ok(input)
|
||||
}
|
||||
|
||||
@@ -794,6 +806,144 @@ impl ScanRegion {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn concretize_json2_types(input: ScanInput) -> Result<ScanInput> {
|
||||
let Some(output_schema) = input.mapper.as_flat().map(|x| x.output_schema()) else {
|
||||
return Ok(input);
|
||||
};
|
||||
let output_arrow_schema = output_schema.arrow_schema();
|
||||
if !output_arrow_schema.has_json_extension_field() {
|
||||
return Ok(input);
|
||||
}
|
||||
|
||||
let memtable_schemas = input
|
||||
.memtables
|
||||
.iter()
|
||||
.filter_map(|mem| mem.record_batch_schema())
|
||||
.collect::<Vec<_>>();
|
||||
let parquet_schemas = collect_parquet_record_batch_schemas(
|
||||
&input.files,
|
||||
&input.access_layer,
|
||||
&input.cache_strategy,
|
||||
)
|
||||
.await?;
|
||||
if memtable_schemas.is_empty()
|
||||
&& parquet_schemas.is_empty()
|
||||
// TODO(LFC): If we can concrete json2 type solely by query-driven hint, we can skip data-driven concretize.
|
||||
&& input.json2_column_types.is_empty()
|
||||
{
|
||||
return Ok(input);
|
||||
}
|
||||
|
||||
let mut column_schemas = output_schema.column_schemas().to_vec();
|
||||
let mut changed = false;
|
||||
for (idx, column_schema) in column_schemas.iter_mut().enumerate() {
|
||||
let output_field = &output_arrow_schema.fields()[idx];
|
||||
if !is_json_extension_type(output_field) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut merged = input
|
||||
.json2_column_types
|
||||
.get(&column_schema.name)
|
||||
.map(ConcreteDataType::as_arrow_type);
|
||||
for schema in &memtable_schemas {
|
||||
if let Some((_, field)) = schema.column_with_name(&column_schema.name) {
|
||||
merge_json_type_candidate(&mut merged, field.data_type());
|
||||
}
|
||||
}
|
||||
for schema in parquet_schemas.iter() {
|
||||
if let Some((_, field)) = schema.as_ref().column_with_name(&column_schema.name) {
|
||||
merge_json_type_candidate(&mut merged, field.data_type());
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(merged) = merged
|
||||
&& merged != *output_field.data_type()
|
||||
{
|
||||
column_schema.data_type = ConcreteDataType::from_arrow_type(&merged);
|
||||
common_telemetry::info!("merged type: {}", column_schema.data_type);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
if changed {
|
||||
let mut mapper = Arc::unwrap_or_clone(input.mapper);
|
||||
mapper.with_flat_output_schema(Arc::new(Schema::new(column_schemas)));
|
||||
Ok(ScanInput {
|
||||
mapper: Arc::new(mapper),
|
||||
..input
|
||||
})
|
||||
} else {
|
||||
Ok(input)
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_json_type_candidate(merged: &mut Option<ArrowDataType>, candidate: &ArrowDataType) {
|
||||
match merged {
|
||||
Some(current) => {
|
||||
*current = json_type::merge_as_json_type(current, candidate).into_owned();
|
||||
}
|
||||
None => {
|
||||
*merged = Some(candidate.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn collect_parquet_record_batch_schemas(
|
||||
files: &[FileHandle],
|
||||
access_layer: &AccessLayerRef,
|
||||
cache_strategy: &CacheStrategy,
|
||||
) -> Result<Vec<datatypes::arrow::datatypes::SchemaRef>> {
|
||||
let mut schemas = Vec::with_capacity(files.len());
|
||||
for file in files {
|
||||
let parquet_metadata =
|
||||
read_or_load_parquet_metadata(file, access_layer, cache_strategy).await?;
|
||||
let file_metadata = parquet_metadata.file_metadata();
|
||||
let arrow_schema = parquet_to_arrow_schema(
|
||||
file_metadata.schema_descr(),
|
||||
file_metadata.key_value_metadata(),
|
||||
)
|
||||
.map_err(|e| {
|
||||
InvalidMetaSnafu {
|
||||
reason: format!(
|
||||
"Failed to convert parquet metadata to arrow schema, file: {}, error: {e}",
|
||||
file.file_id()
|
||||
),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
if arrow_schema.has_json_extension_field() {
|
||||
schemas.push(Arc::new(arrow_schema));
|
||||
}
|
||||
}
|
||||
Ok(schemas)
|
||||
}
|
||||
|
||||
async fn read_or_load_parquet_metadata(
|
||||
file: &FileHandle,
|
||||
access_layer: &AccessLayerRef,
|
||||
cache_strategy: &CacheStrategy,
|
||||
) -> Result<Arc<ParquetMetaData>> {
|
||||
let mut metrics = MetadataCacheMetrics::default();
|
||||
if let Some(metadata) = cache_strategy
|
||||
.get_sst_meta_data(file.file_id(), &mut metrics, PageIndexPolicy::default())
|
||||
.await
|
||||
{
|
||||
return Ok(metadata.parquet_metadata());
|
||||
}
|
||||
|
||||
let file_path = file.file_path(access_layer.table_dir(), access_layer.path_type());
|
||||
let file_size = file.meta_ref().file_size;
|
||||
let metadata = MetadataLoader::new(access_layer.object_store().clone(), &file_path, file_size)
|
||||
.load(&mut metrics)
|
||||
.await
|
||||
.and_then(|x| CachedSstMeta::try_new(&file_path, x))
|
||||
.map(Arc::new)?;
|
||||
cache_strategy.put_sst_meta_data(file.file_id(), metadata.clone());
|
||||
|
||||
Ok(metadata.parquet_metadata())
|
||||
}
|
||||
|
||||
/// Returns true if the time range of a SST `file` matches the `predicate`.
|
||||
fn file_in_range(file: &FileHandle, predicate: &TimestampRange) -> bool {
|
||||
if predicate == &TimestampRange::min_to_max() {
|
||||
@@ -855,6 +1005,8 @@ pub struct ScanInput {
|
||||
pub(crate) series_row_selector: Option<TimeSeriesRowSelector>,
|
||||
/// Hint for the required distribution of the scanner.
|
||||
pub(crate) distribution: Option<TimeSeriesDistribution>,
|
||||
/// Query-driven target types for JSON2 columns.
|
||||
json2_column_types: HashMap<String, ConcreteDataType>,
|
||||
/// Whether to use flat format.
|
||||
pub(crate) flat_format: bool,
|
||||
/// Whether this scan is for compaction.
|
||||
@@ -893,6 +1045,7 @@ impl ScanInput {
|
||||
merge_mode: MergeMode::default(),
|
||||
series_row_selector: None,
|
||||
distribution: None,
|
||||
json2_column_types: HashMap::new(),
|
||||
flat_format: false,
|
||||
compaction: false,
|
||||
#[cfg(feature = "enterprise")]
|
||||
@@ -929,6 +1082,15 @@ impl ScanInput {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the query-driven target types for JSON2 columns, keyed by column name.
///
/// Any previously set map is replaced as a whole.
#[must_use]
|
||||
fn with_json2_column_types(
|
||||
mut self,
|
||||
json2_column_types: HashMap<String, ConcreteDataType>,
|
||||
) -> Self {
|
||||
self.json2_column_types = json2_column_types;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets cache for this query.
|
||||
#[must_use]
|
||||
pub(crate) fn with_cache(mut self, cache: CacheStrategy) -> Self {
|
||||
|
||||
@@ -91,6 +91,7 @@ pub struct FlatSchemaOptions {
|
||||
/// when storing primary key columns.
|
||||
/// Only takes effect when `raw_pk_columns` is true.
|
||||
pub string_pk_use_dict: bool,
|
||||
pub override_schema: Option<SchemaRef>,
|
||||
}
|
||||
|
||||
impl Default for FlatSchemaOptions {
|
||||
@@ -98,6 +99,7 @@ impl Default for FlatSchemaOptions {
|
||||
Self {
|
||||
raw_pk_columns: true,
|
||||
string_pk_use_dict: true,
|
||||
override_schema: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -111,6 +113,7 @@ impl FlatSchemaOptions {
|
||||
Self {
|
||||
raw_pk_columns: false,
|
||||
string_pk_use_dict: false,
|
||||
override_schema: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -131,7 +134,20 @@ pub fn to_flat_sst_arrow_schema(
|
||||
) -> SchemaRef {
|
||||
let num_fields = flat_sst_arrow_schema_column_num(metadata, options);
|
||||
let mut fields = Vec::with_capacity(num_fields);
|
||||
let schema = metadata.schema.arrow_schema();
|
||||
|
||||
let mut schema = metadata.schema.arrow_schema().clone();
|
||||
if let Some(override_schema) = &options.override_schema {
|
||||
let mut fields = Vec::with_capacity(schema.fields().len());
|
||||
for field in schema.fields() {
|
||||
if let Some((_, override_field)) = override_schema.fields().find(field.name()) {
|
||||
fields.push(override_field.clone());
|
||||
} else {
|
||||
fields.push(field.clone());
|
||||
}
|
||||
}
|
||||
schema = Arc::new(Schema::new_with_metadata(fields, schema.metadata().clone()));
|
||||
};
|
||||
|
||||
if options.raw_pk_columns {
|
||||
for pk_id in &metadata.primary_key {
|
||||
let pk_index = metadata.column_index_by_id(*pk_id).unwrap();
|
||||
|
||||
@@ -1382,6 +1382,7 @@ mod tests {
|
||||
bloom_filter_index_config: Default::default(),
|
||||
#[cfg(feature = "vector_index")]
|
||||
vector_index_config: Default::default(),
|
||||
schema: None,
|
||||
};
|
||||
let mut metrics = Metrics::new(WriteType::Flush);
|
||||
env.access_layer
|
||||
|
||||
@@ -1247,7 +1247,7 @@ mod tests {
|
||||
.await;
|
||||
|
||||
writer
|
||||
.write_all_flat(flat_source, None, write_opts)
|
||||
.write_all_flat(flat_source, None, None, write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.remove(0)
|
||||
@@ -1358,7 +1358,7 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let info = writer
|
||||
.write_all_flat(flat_source, None, &write_opts)
|
||||
.write_all_flat(flat_source, None, None, &write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.remove(0);
|
||||
|
||||
@@ -49,7 +49,7 @@ use store_api::storage::{ColumnId, SequenceNumber};
|
||||
|
||||
use crate::error::{
|
||||
ComputeArrowSnafu, DecodeSnafu, InvalidParquetSnafu, InvalidRecordBatchSnafu,
|
||||
NewRecordBatchSnafu, Result,
|
||||
NewRecordBatchSnafu, RecordBatchSnafu, Result,
|
||||
};
|
||||
use crate::sst::parquet::format::{
|
||||
FIXED_POS_COLUMN_NUM, FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray,
|
||||
@@ -103,6 +103,11 @@ impl FlatWriteFormat {
|
||||
let sequence_array = Arc::new(UInt64Array::from(vec![override_sequence; batch.num_rows()]));
|
||||
columns[sequence_column_index(batch.num_columns())] = sequence_array;
|
||||
|
||||
let columns = common_recordbatch::recordbatch::maybe_align_json_array_with_schema(
|
||||
&self.arrow_schema,
|
||||
columns,
|
||||
)
|
||||
.context(RecordBatchSnafu)?;
|
||||
RecordBatch::try_new(self.arrow_schema.clone(), columns).context(NewRecordBatchSnafu)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -456,8 +456,7 @@ impl ParquetReaderBuilder {
|
||||
.unwrap_or_else(|| region_meta.schema.clone());
|
||||
|
||||
// Create ArrowReaderMetadata for async stream building.
|
||||
let arrow_reader_options =
|
||||
ArrowReaderOptions::new().with_schema(read_format.arrow_schema().clone());
|
||||
let arrow_reader_options = ArrowReaderOptions::new();
|
||||
let arrow_metadata =
|
||||
ArrowReaderMetadata::try_new(parquet_meta.clone(), arrow_reader_options)
|
||||
.context(ReadDataPartSnafu)?;
|
||||
|
||||
@@ -72,6 +72,7 @@ enum FlatBatchConverter {
|
||||
}
|
||||
|
||||
impl FlatBatchConverter {
|
||||
#[expect(unused)]
|
||||
fn arrow_schema(&self) -> &SchemaRef {
|
||||
match self {
|
||||
FlatBatchConverter::Flat(f) => f.arrow_schema(),
|
||||
@@ -275,15 +276,16 @@ where
|
||||
pub async fn write_all_flat(
|
||||
&mut self,
|
||||
source: FlatSource,
|
||||
override_schema: Option<SchemaRef>,
|
||||
override_sequence: Option<SequenceNumber>,
|
||||
opts: &WriteOptions,
|
||||
) -> Result<SstInfoArray> {
|
||||
let mut options = FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding);
|
||||
options.override_schema = override_schema;
|
||||
|
||||
let converter = FlatBatchConverter::Flat(
|
||||
FlatWriteFormat::new(
|
||||
self.metadata.clone(),
|
||||
&FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding),
|
||||
)
|
||||
.with_override_sequence(override_sequence),
|
||||
FlatWriteFormat::new(self.metadata.clone(), &options)
|
||||
.with_override_sequence(override_sequence),
|
||||
);
|
||||
let res = self.write_all_flat_inner(source, &converter, opts).await;
|
||||
if res.is_err() {
|
||||
@@ -406,7 +408,7 @@ where
|
||||
let arrow_batch = converter.convert_batch(&record_batch)?;
|
||||
|
||||
let start = Instant::now();
|
||||
self.maybe_init_writer(converter.arrow_schema(), opts)
|
||||
self.maybe_init_writer(arrow_batch.schema_ref(), opts)
|
||||
.await?
|
||||
.write(&arrow_batch)
|
||||
.await
|
||||
|
||||
@@ -301,12 +301,22 @@ impl<'a, 'b> JsonColumnTypeUpdater<'a, 'b> {
|
||||
.or_insert_with(|| value_type.clone());
|
||||
|
||||
if !merged_type.is_include(&value_type) {
|
||||
merged_type.merge(&value_type).map_err(|e| {
|
||||
if column_schema
|
||||
.data_type
|
||||
.as_json()
|
||||
.map(|x| x.is_native_type())
|
||||
.unwrap_or(false)
|
||||
{
|
||||
merged_type.merge(&value_type)
|
||||
} else {
|
||||
merged_type.merge_with_lifting(&value_type)
|
||||
}
|
||||
.map_err(|e| {
|
||||
InvalidInsertRequestSnafu {
|
||||
reason: format!(r#"cannot merge "{value_type}" into "{merged_type}": {e}"#),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
})?
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -323,7 +333,17 @@ impl<'a, 'b> JsonColumnTypeUpdater<'a, 'b> {
|
||||
for (column_name, merged_type) in self.merged_value_types.iter() {
|
||||
let Some(column_type) = insert_columns
|
||||
.iter()
|
||||
.find_map(|x| (&x.name == column_name).then(|| x.data_type.as_json()))
|
||||
.find_map(|x| {
|
||||
(&x.name == column_name).then(|| {
|
||||
if let ConcreteDataType::Json(t) = &x.data_type
|
||||
&& t.is_native_type()
|
||||
{
|
||||
Some(t)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
})
|
||||
.flatten()
|
||||
else {
|
||||
continue;
|
||||
|
||||
@@ -418,7 +418,7 @@ fn resolve_schema(
|
||||
match (column_type, value_type) {
|
||||
(column_type, value_type) if column_type == value_type => Ok(()),
|
||||
(ConcreteDataType::Json(column_type), ConcreteDataType::Json(value_type))
|
||||
if column_type.is_include(value_type) =>
|
||||
if column_type.is_json2() =>
|
||||
{
|
||||
Ok(())
|
||||
}
|
||||
@@ -689,17 +689,16 @@ fn resolve_value(
|
||||
}
|
||||
|
||||
VrlValue::Array(_) | VrlValue::Object(_) => {
|
||||
let is_json_native_type = schema_info
|
||||
let is_json2_type = schema_info
|
||||
.find_column_schema_in_table(&column_name)
|
||||
.is_some_and(|x| {
|
||||
if let ConcreteDataType::Json(column_type) = &x.column_schema.data_type {
|
||||
column_type.is_native_type()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
matches!(
|
||||
&x.column_schema.data_type,
|
||||
ConcreteDataType::Json(column_type) if column_type.is_json2()
|
||||
)
|
||||
});
|
||||
|
||||
let value = if is_json_native_type {
|
||||
let value = if is_json2_type {
|
||||
let json_extension_type: Option<JsonExtensionType> =
|
||||
if let Some(x) = schema_info.find_column_schema_in_table(&column_name) {
|
||||
x.column_schema.extension_type()?
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! Planner, QueryEngine implementations based on DataFusion.
|
||||
|
||||
mod error;
|
||||
mod json2_expr_planner;
|
||||
mod planner;
|
||||
|
||||
use std::any::Any;
|
||||
|
||||
127
src/query/src/datafusion/json2_expr_planner.rs
Normal file
127
src/query/src/datafusion/json2_expr_planner.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::Field;
|
||||
use arrow_schema::extension::ExtensionType;
|
||||
use common_function::scalars::json::json2_get::{Json2GetFunction, datatype_expr};
|
||||
use common_function::scalars::udf::create_udf;
|
||||
use datafusion_common::arrow::datatypes::DataType;
|
||||
use datafusion_common::{Column, DataFusionError, Result, ScalarValue, TableReference};
|
||||
use datafusion_expr::expr::{BinaryExpr, ScalarFunction};
|
||||
use datafusion_expr::planner::{ExprPlanner, PlannerResult, RawBinaryExpr};
|
||||
use datafusion_expr::{Expr, ExprSchemable, Operator};
|
||||
use datatypes::extension::json::JsonExtensionType;
|
||||
use sqlparser::ast::BinaryOperator;
|
||||
|
||||
/// Expression planner hooked into SQL planning for JSON2 columns.
///
/// It turns compound identifiers on JSON extension fields into `json2_get`
/// calls and re-targets the type hint of `json2_get` calls that appear as an
/// operand of a binary expression.
#[derive(Debug)]
pub(crate) struct Json2ExprPlanner;
|
||||
|
||||
fn json2_get(base: Expr, path: String) -> Result<Expr> {
|
||||
let args = vec![
|
||||
base,
|
||||
Expr::Literal(ScalarValue::Utf8(Some(path)), None),
|
||||
datatype_expr(&DataType::Utf8View)?,
|
||||
];
|
||||
let function = create_udf(Arc::new(Json2GetFunction::default()));
|
||||
Ok(Expr::ScalarFunction(ScalarFunction::new_udf(
|
||||
Arc::new(function),
|
||||
args,
|
||||
)))
|
||||
}
|
||||
|
||||
impl ExprPlanner for Json2ExprPlanner {
|
||||
fn plan_binary_op(
|
||||
&self,
|
||||
expr: RawBinaryExpr,
|
||||
schema: &datafusion_common::DFSchema,
|
||||
) -> Result<PlannerResult<RawBinaryExpr>> {
|
||||
let Some(operator) = parse_sql_binary_op(&expr.op) else {
|
||||
return Ok(PlannerResult::Original(expr));
|
||||
};
|
||||
|
||||
let left_type = expr.left.get_type(schema)?;
|
||||
let right_type = expr.right.get_type(schema)?;
|
||||
let left_rewritten = rewrite_expr_json2_get(&expr.left, right_type)?;
|
||||
let right_rewritten = rewrite_expr_json2_get(&expr.right, left_type)?;
|
||||
if left_rewritten.is_none() && right_rewritten.is_none() {
|
||||
return Ok(PlannerResult::Original(expr));
|
||||
}
|
||||
|
||||
let rewritten = Expr::BinaryExpr(BinaryExpr::new(
|
||||
Box::new(left_rewritten.unwrap_or(expr.left)),
|
||||
operator,
|
||||
Box::new(right_rewritten.unwrap_or(expr.right)),
|
||||
));
|
||||
common_telemetry::debug!("json2 plan_binary_op: rewritten={rewritten:?}");
|
||||
Ok(PlannerResult::Planned(rewritten))
|
||||
}
|
||||
|
||||
fn plan_compound_identifier(
|
||||
&self,
|
||||
field: &Field,
|
||||
qualifier: Option<&TableReference>,
|
||||
nested_names: &[String],
|
||||
) -> Result<PlannerResult<Vec<Expr>>> {
|
||||
if field.extension_type_name() != Some(JsonExtensionType::NAME) {
|
||||
return Ok(PlannerResult::Original(Vec::new()));
|
||||
}
|
||||
|
||||
let path = nested_names.join(".");
|
||||
let column = Column::from((qualifier, field));
|
||||
json2_get(Expr::Column(column), path).map(PlannerResult::Planned)
|
||||
}
|
||||
}
|
||||
|
||||
fn rewrite_expr_json2_get(expr: &Expr, data_type: DataType) -> Result<Option<Expr>> {
|
||||
let Expr::ScalarFunction(func) = expr else {
|
||||
return Ok(None);
|
||||
};
|
||||
if func.func.name() != Json2GetFunction::NAME {
|
||||
return Ok(None);
|
||||
}
|
||||
if func.args.len() != 3 {
|
||||
return Err(DataFusionError::Internal(format!(
|
||||
"Function {} is expected to have 3 arguments!",
|
||||
func.name()
|
||||
)));
|
||||
}
|
||||
|
||||
let expected_expr = datatype_expr(&data_type)?;
|
||||
let rewritten = Expr::ScalarFunction(ScalarFunction {
|
||||
func: func.func.clone(),
|
||||
args: vec![func.args[0].clone(), func.args[1].clone(), expected_expr],
|
||||
});
|
||||
Ok(Some(rewritten))
|
||||
}
|
||||
|
||||
fn parse_sql_binary_op(op: &BinaryOperator) -> Option<Operator> {
|
||||
match *op {
|
||||
BinaryOperator::Gt => Some(Operator::Gt),
|
||||
BinaryOperator::GtEq => Some(Operator::GtEq),
|
||||
BinaryOperator::Lt => Some(Operator::Lt),
|
||||
BinaryOperator::LtEq => Some(Operator::LtEq),
|
||||
BinaryOperator::Eq => Some(Operator::Eq),
|
||||
BinaryOperator::NotEq => Some(Operator::NotEq),
|
||||
BinaryOperator::Plus => Some(Operator::Plus),
|
||||
BinaryOperator::Minus => Some(Operator::Minus),
|
||||
BinaryOperator::Multiply => Some(Operator::Multiply),
|
||||
BinaryOperator::Divide => Some(Operator::Divide),
|
||||
BinaryOperator::Modulo => Some(Operator::Modulo),
|
||||
BinaryOperator::And => Some(Operator::And),
|
||||
BinaryOperator::Or => Some(Operator::Or),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -38,6 +38,7 @@ use datafusion_sql::parser::Statement as DfStatement;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::{Location, ResultExt};
|
||||
|
||||
use crate::datafusion::json2_expr_planner::Json2ExprPlanner;
|
||||
use crate::error::{CatalogSnafu, Result};
|
||||
use crate::query_engine::{DefaultPlanDecoder, QueryEngineState};
|
||||
|
||||
@@ -87,6 +88,9 @@ impl DfContextProviderAdapter {
|
||||
.map(|format| (format.get_ext().to_lowercase(), format))
|
||||
.collect();
|
||||
|
||||
let mut expr_planners = SessionStateDefaults::default_expr_planners();
|
||||
expr_planners.insert(0, Arc::new(Json2ExprPlanner));
|
||||
|
||||
Ok(Self {
|
||||
engine_state,
|
||||
session_state,
|
||||
@@ -94,7 +98,7 @@ impl DfContextProviderAdapter {
|
||||
table_provider,
|
||||
query_ctx,
|
||||
file_formats,
|
||||
expr_planners: SessionStateDefaults::default_expr_planners(),
|
||||
expr_planners,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! Dummy catalog for region server.
|
||||
|
||||
use std::any::Any;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
@@ -30,6 +31,7 @@ use datafusion::physical_plan::ExecutionPlan;
|
||||
use datafusion_common::DataFusionError;
|
||||
use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType};
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use futures::stream::BoxStream;
|
||||
use session::context::{QueryContext, QueryContextRef};
|
||||
use snafu::ResultExt;
|
||||
@@ -266,6 +268,10 @@ impl DummyTableProvider {
|
||||
self.scan_request.lock().unwrap().vector_search.clone()
|
||||
}
|
||||
|
||||
/// Stores a copy of the given per-column JSON2 target types in the scan request.
///
/// Any previously stored hints are replaced. Panics if locking
/// `scan_request` fails (the lock result is unwrapped).
pub fn with_json2_type_hint(&self, json2_column_types: &HashMap<String, ConcreteDataType>) {
    let hints = json2_column_types.clone();
    let mut request = self.scan_request.lock().unwrap();
    request.json2_column_types = hints;
}
|
||||
|
||||
pub fn with_sequence(&self, sequence: u64) {
|
||||
self.scan_request.lock().unwrap().memtable_max_sequence = Some(sequence);
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
pub mod constant_term;
|
||||
pub mod count_nest_aggr;
|
||||
pub mod count_wildcard;
|
||||
pub mod json2_scan_hint;
|
||||
pub mod parallelize_scan;
|
||||
pub mod pass_distribution;
|
||||
pub mod remove_duplicate;
|
||||
|
||||
225
src/query/src/optimizer/json2_scan_hint.rs
Normal file
225
src/query/src/optimizer/json2_scan_hint.rs
Normal file
@@ -0,0 +1,225 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use common_function::scalars::json::json2_get::Json2GetFunction;
|
||||
use datafusion::datasource::DefaultTableSource;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
|
||||
use datafusion_common::{Result, ScalarValue, TableReference, internal_err};
|
||||
use datafusion_expr::expr::ScalarFunction;
|
||||
use datafusion_expr::{Expr, LogicalPlan};
|
||||
use datafusion_optimizer::{OptimizerConfig, OptimizerRule};
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use datatypes::json::requirement::JsonPathTarget;
|
||||
use datatypes::types::JsonFormat;
|
||||
|
||||
use crate::dummy_catalog::DummyTableProvider;
|
||||
|
||||
/// Optimizer rule that passes the types requested by `json2_get` calls in a
/// plan down to the table scans as JSON2 column type hints.
#[derive(Debug)]
pub struct Json2ScanHintRule;
|
||||
|
||||
impl OptimizerRule for Json2ScanHintRule {
|
||||
fn name(&self) -> &str {
|
||||
"Json2ScanHintRule"
|
||||
}
|
||||
|
||||
fn rewrite(
|
||||
&self,
|
||||
plan: LogicalPlan,
|
||||
_config: &dyn OptimizerConfig,
|
||||
) -> Result<Transformed<LogicalPlan>> {
|
||||
let requirements = Json2TypeRequirements::collect(&plan)?;
|
||||
if requirements.is_empty() {
|
||||
return Ok(Transformed::no(plan));
|
||||
}
|
||||
|
||||
plan.transform_down(&mut |plan| match &plan {
|
||||
LogicalPlan::TableScan(table_scan) => {
|
||||
let Some(source) = table_scan
|
||||
.source
|
||||
.as_any()
|
||||
.downcast_ref::<DefaultTableSource>()
|
||||
else {
|
||||
return Ok(Transformed::no(plan));
|
||||
};
|
||||
|
||||
let Some(adapter) = source
|
||||
.table_provider
|
||||
.as_any()
|
||||
.downcast_ref::<DummyTableProvider>()
|
||||
else {
|
||||
return Ok(Transformed::no(plan));
|
||||
};
|
||||
|
||||
let hints =
|
||||
requirements.merge(&table_scan.table_name, &adapter.region_metadata().schema);
|
||||
adapter.with_json2_type_hint(&hints);
|
||||
Ok(Transformed::yes(plan))
|
||||
}
|
||||
_ => Ok(Transformed::no(plan)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
struct Json2ColumnKey {
|
||||
relation: Option<TableReference>,
|
||||
name: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct Json2TypeRequirements {
|
||||
path_targets: HashMap<Json2ColumnKey, JsonPathTarget>,
|
||||
}
|
||||
|
||||
impl Json2TypeRequirements {
|
||||
fn collect(plan: &LogicalPlan) -> Result<Self> {
|
||||
let mut collector = Self::default();
|
||||
plan.apply(|node| {
|
||||
for expr in node.expressions() {
|
||||
let _ = expr.apply(|expr| {
|
||||
if let Some((column, path, data_type)) = extract_json2_get(expr)? {
|
||||
collector
|
||||
.path_targets
|
||||
.entry(column)
|
||||
.or_default()
|
||||
.require_typed_path(&path, data_type);
|
||||
}
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
})?;
|
||||
}
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
})?;
|
||||
Ok(collector)
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.path_targets.is_empty()
|
||||
}
|
||||
|
||||
fn merge(
|
||||
&self,
|
||||
table_name: &TableReference,
|
||||
schema: &datatypes::schema::SchemaRef,
|
||||
) -> HashMap<String, ConcreteDataType> {
|
||||
let mut types = HashMap::new();
|
||||
|
||||
for column_schema in schema.column_schemas() {
|
||||
let ConcreteDataType::Json(json_type) = &column_schema.data_type else {
|
||||
continue;
|
||||
};
|
||||
if !matches!(json_type.format, JsonFormat::Json2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let matching_keys = self
|
||||
.path_targets
|
||||
.iter()
|
||||
.filter(|(key, _)| {
|
||||
key.name == column_schema.name
|
||||
&& key.relation.as_ref().is_none_or(|x| x == table_name)
|
||||
})
|
||||
.map(|(_, target)| target.clone())
|
||||
.collect::<Vec<_>>();
|
||||
if matching_keys.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut merged = JsonPathTarget::default();
|
||||
for target in matching_keys {
|
||||
if let Some(data_type) = target.build_type() {
|
||||
merge_path_target_from_type(&mut merged, &data_type, "");
|
||||
}
|
||||
}
|
||||
if let Some(data_type) = merged.build_type() {
|
||||
let _ = types.insert(column_schema.name.clone(), data_type);
|
||||
}
|
||||
}
|
||||
|
||||
types
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_json2_get(expr: &Expr) -> Result<Option<(Json2ColumnKey, String, ConcreteDataType)>> {
|
||||
let Expr::ScalarFunction(ScalarFunction { func, args }) = expr else {
|
||||
return Ok(None);
|
||||
};
|
||||
if func.name() != Json2GetFunction::NAME {
|
||||
return Ok(None);
|
||||
}
|
||||
if args.len() != 3 {
|
||||
return internal_err!("function {} must have 3 arguments", Json2GetFunction::NAME);
|
||||
}
|
||||
|
||||
let Expr::Column(column) = &args[0] else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let path = match &args[1] {
|
||||
Expr::Literal(ScalarValue::Utf8(Some(path)), _)
|
||||
| Expr::Literal(ScalarValue::LargeUtf8(Some(path)), _)
|
||||
| Expr::Literal(ScalarValue::Utf8View(Some(path)), _) => path.clone(),
|
||||
_ => return Ok(None),
|
||||
};
|
||||
|
||||
let data_type = args
|
||||
.get(2)
|
||||
.and_then(extract_expected_type)
|
||||
.unwrap_or_else(ConcreteDataType::string_datatype);
|
||||
|
||||
Ok(Some((
|
||||
Json2ColumnKey {
|
||||
relation: column.relation.clone(),
|
||||
name: column.name.clone(),
|
||||
},
|
||||
path,
|
||||
data_type,
|
||||
)))
|
||||
}
|
||||
|
||||
fn extract_expected_type(expr: &Expr) -> Option<ConcreteDataType> {
|
||||
match expr {
|
||||
Expr::Literal(value, _) => {
|
||||
let data_type = value.data_type();
|
||||
Some(ConcreteDataType::from_arrow_type(&data_type))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_path_target_from_type(
|
||||
target: &mut JsonPathTarget,
|
||||
data_type: &ConcreteDataType,
|
||||
prefix: &str,
|
||||
) {
|
||||
match data_type {
|
||||
ConcreteDataType::Struct(struct_type) => {
|
||||
let fields = struct_type.fields();
|
||||
for field in fields.iter() {
|
||||
let path = if prefix.is_empty() {
|
||||
field.name().to_string()
|
||||
} else {
|
||||
format!("{prefix}.{}", field.name())
|
||||
};
|
||||
merge_path_target_from_type(target, field.data_type(), &path);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if !prefix.is_empty() {
|
||||
target.require_typed_path(prefix, data_type.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -62,6 +62,7 @@ use crate::optimizer::ExtensionAnalyzerRule;
|
||||
use crate::optimizer::constant_term::MatchesConstantTermOptimizer;
|
||||
use crate::optimizer::count_nest_aggr::CountNestAggrRule;
|
||||
use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
|
||||
use crate::optimizer::json2_scan_hint::Json2ScanHintRule;
|
||||
use crate::optimizer::parallelize_scan::ParallelizeScan;
|
||||
use crate::optimizer::pass_distribution::PassDistribution;
|
||||
use crate::optimizer::remove_duplicate::RemoveDuplicate;
|
||||
@@ -173,6 +174,7 @@ impl QueryEngineState {
|
||||
analyzer.rules.push(Arc::new(FixStateUdafOrderingAnalyzer));
|
||||
|
||||
let mut optimizer = Optimizer::new();
|
||||
optimizer.rules.push(Arc::new(Json2ScanHintRule));
|
||||
optimizer.rules.push(Arc::new(ScanHintRule));
|
||||
|
||||
// add physical optimizer
|
||||
|
||||
@@ -153,7 +153,16 @@ pub fn column_to_schema(
|
||||
|
||||
column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some());
|
||||
|
||||
if matches!(column.data_type(), SqlDataType::JSON) {
|
||||
let is_json2_column = if let SqlDataType::Custom(object_name, _) = column.data_type() {
|
||||
object_name
|
||||
.0
|
||||
.first()
|
||||
.map(|x| x.to_string_unquoted().eq_ignore_ascii_case("JSON2"))
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if is_json2_column || matches!(column.data_type(), SqlDataType::JSON) {
|
||||
let settings = column
|
||||
.extensions
|
||||
.build_json_structure_settings()?
|
||||
@@ -290,22 +299,25 @@ pub fn sql_data_type_to_concrete_data_type(
|
||||
};
|
||||
Ok(ConcreteDataType::Json(JsonType::new(format)))
|
||||
}
|
||||
// Vector type
|
||||
SqlDataType::Custom(name, d)
|
||||
if name.0.as_slice().len() == 1
|
||||
&& name.0.as_slice()[0]
|
||||
.to_string_unquoted()
|
||||
.to_ascii_uppercase()
|
||||
== VECTOR_TYPE_NAME
|
||||
&& d.len() == 1 =>
|
||||
{
|
||||
let dim = d[0].parse().map_err(|e| {
|
||||
error::ParseSqlValueSnafu {
|
||||
msg: format!("Failed to parse vector dimension: {}", e),
|
||||
// Vector type and JSON2 type
|
||||
SqlDataType::Custom(name, d) if name.0.len() == 1 => {
|
||||
let name = name.0[0].to_string_unquoted().to_ascii_uppercase();
|
||||
match name.as_str() {
|
||||
VECTOR_TYPE_NAME if d.len() == 1 => {
|
||||
let dim = d[0].parse().map_err(|e| {
|
||||
error::ParseSqlValueSnafu {
|
||||
msg: format!(r#"Failed to parse vector dimension "{}": {}"#, d[0], e),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
Ok(ConcreteDataType::vector_datatype(dim))
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
Ok(ConcreteDataType::vector_datatype(dim))
|
||||
"JSON2" => Ok(ConcreteDataType::Json(JsonType::new(JsonFormat::Json2))),
|
||||
_ => error::SqlTypeNotSupportedSnafu {
|
||||
t: data_type.clone(),
|
||||
}
|
||||
.fail(),
|
||||
}
|
||||
}
|
||||
_ => error::SqlTypeNotSupportedSnafu {
|
||||
t: data_type.clone(),
|
||||
|
||||
@@ -377,32 +377,35 @@ impl ColumnExtensions {
|
||||
None
|
||||
};
|
||||
|
||||
options
|
||||
let format = options
|
||||
.get(JSON_OPT_FORMAT)
|
||||
.map(|format| match format {
|
||||
JSON_FORMAT_FULL_STRUCTURED => Ok(JsonStructureSettings::Structured(fields)),
|
||||
JSON_FORMAT_PARTIAL => {
|
||||
let fields = fields.map(|fields| {
|
||||
let mut fields = Arc::unwrap_or_clone(fields.fields());
|
||||
fields.push(datatypes::types::StructField::new(
|
||||
JsonStructureSettings::RAW_FIELD.to_string(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
));
|
||||
StructType::new(Arc::new(fields))
|
||||
});
|
||||
Ok(JsonStructureSettings::PartialUnstructuredByKey {
|
||||
fields,
|
||||
unstructured_keys,
|
||||
})
|
||||
.unwrap_or(JSON_FORMAT_FULL_STRUCTURED);
|
||||
let settings = match format {
|
||||
JSON_FORMAT_FULL_STRUCTURED => JsonStructureSettings::Structured(fields),
|
||||
JSON_FORMAT_PARTIAL => {
|
||||
let fields = fields.map(|fields| {
|
||||
let mut fields = Arc::unwrap_or_clone(fields.fields());
|
||||
fields.push(datatypes::types::StructField::new(
|
||||
JsonStructureSettings::RAW_FIELD.to_string(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
));
|
||||
StructType::new(Arc::new(fields))
|
||||
});
|
||||
JsonStructureSettings::PartialUnstructuredByKey {
|
||||
fields,
|
||||
unstructured_keys,
|
||||
}
|
||||
JSON_FORMAT_RAW => Ok(JsonStructureSettings::UnstructuredRaw),
|
||||
_ => InvalidSqlSnafu {
|
||||
}
|
||||
JSON_FORMAT_RAW => JsonStructureSettings::UnstructuredRaw,
|
||||
_ => {
|
||||
return InvalidSqlSnafu {
|
||||
msg: format!("unknown JSON datatype 'format': {format}"),
|
||||
}
|
||||
.fail(),
|
||||
})
|
||||
.transpose()
|
||||
.fail();
|
||||
}
|
||||
};
|
||||
Ok(Some(settings))
|
||||
}
|
||||
|
||||
pub fn set_json_structure_settings(&mut self, settings: JsonStructureSettings) {
|
||||
|
||||
@@ -12,12 +12,14 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::{Display, Formatter};
|
||||
|
||||
use common_error::ext::BoxedError;
|
||||
use common_recordbatch::OrderOption;
|
||||
use datafusion_expr::expr::Expr;
|
||||
// Re-export vector types from datatypes to avoid duplication
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
pub use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};
|
||||
use strum::Display;
|
||||
|
||||
@@ -130,6 +132,8 @@ pub struct ScanRequest {
|
||||
pub vector_search: Option<VectorSearchRequest>,
|
||||
/// Whether to force reading region data in flat format.
|
||||
pub force_flat_format: bool,
|
||||
/// Optional target types for query-driven JSON2 concretization.
|
||||
pub json2_column_types: HashMap<String, ConcreteDataType>,
|
||||
}
|
||||
|
||||
impl Display for ScanRequest {
|
||||
@@ -228,6 +232,14 @@ impl Display for ScanRequest {
|
||||
self.force_flat_format
|
||||
)?;
|
||||
}
|
||||
if !self.json2_column_types.is_empty() {
|
||||
write!(
|
||||
f,
|
||||
"{}json2_column_types: {:?}",
|
||||
delimiter.as_str(),
|
||||
self.json2_column_types
|
||||
)?;
|
||||
}
|
||||
write!(f, " }}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ autotests = false
|
||||
|
||||
[[test]]
|
||||
name = "main"
|
||||
path = "tests/main.rs"
|
||||
path = "tests/it/main.rs"
|
||||
|
||||
[features]
|
||||
dashboard = ["servers/dashboard"]
|
||||
|
||||
@@ -149,16 +149,10 @@ async fn query_data(frontend: &Arc<Instance>) -> io::Result<()> {
|
||||
+----------+"#;
|
||||
execute_sql_and_expect(frontend, sql, expected).await;
|
||||
|
||||
let sql = "SELECT * FROM bluesky ORDER BY time_us";
|
||||
let expected = fs::read_to_string(find_workspace_path(
|
||||
"tests-integration/resources/jsonbench-select-all.txt",
|
||||
))?;
|
||||
execute_sql_and_expect(frontend, sql, &expected).await;
|
||||
|
||||
// query 1:
|
||||
let sql = "
|
||||
SELECT
|
||||
json_get_string(data, '$.commit.collection') AS event, count() AS count
|
||||
data.commit.collection AS event, count() AS count
|
||||
FROM bluesky
|
||||
GROUP BY event
|
||||
ORDER BY count DESC, event ASC";
|
||||
@@ -176,13 +170,12 @@ ORDER BY count DESC, event ASC";
|
||||
// query 2:
|
||||
let sql = "
|
||||
SELECT
|
||||
json_get_string(data, '$.commit.collection') AS event,
|
||||
data.commit.collection AS event,
|
||||
count() AS count,
|
||||
count(DISTINCT json_get_string(data, '$.did')) AS users
|
||||
count(DISTINCT data.did) AS users
|
||||
FROM bluesky
|
||||
WHERE
|
||||
(json_get_string(data, '$.kind') = 'commit') AND
|
||||
(json_get_string(data, '$.commit.operation') = 'create')
|
||||
data.kind = 'commit' AND data.commit.operation = 'create'
|
||||
GROUP BY event
|
||||
ORDER BY count DESC, event ASC";
|
||||
let expected = r#"
|
||||
@@ -199,15 +192,14 @@ ORDER BY count DESC, event ASC";
|
||||
// query 3:
|
||||
let sql = "
|
||||
SELECT
|
||||
json_get_string(data, '$.commit.collection') AS event,
|
||||
date_part('hour', to_timestamp_micros(json_get_int(data, '$.time_us'))) as hour_of_day,
|
||||
data.commit.collection AS event,
|
||||
date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day,
|
||||
count() AS count
|
||||
FROM bluesky
|
||||
WHERE
|
||||
(json_get_string(data, '$.kind') = 'commit') AND
|
||||
(json_get_string(data, '$.commit.operation') = 'create') AND
|
||||
json_get_string(data, '$.commit.collection') IN
|
||||
('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
|
||||
data.kind = 'commit' AND
|
||||
data.commit.operation = 'create' AND
|
||||
data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
|
||||
GROUP BY event, hour_of_day
|
||||
ORDER BY hour_of_day, event";
|
||||
let expected = r#"
|
||||
@@ -223,13 +215,13 @@ ORDER BY hour_of_day, event";
|
||||
// query 4:
|
||||
let sql = "
|
||||
SELECT
|
||||
json_get_string(data, '$.did') as user_id,
|
||||
min(to_timestamp_micros(json_get_int(data, '$.time_us'))) AS first_post_ts
|
||||
data.did::String as user_id,
|
||||
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
|
||||
FROM bluesky
|
||||
WHERE
|
||||
(json_get_string(data, '$.kind') = 'commit') AND
|
||||
(json_get_string(data, '$.commit.operation') = 'create') AND
|
||||
(json_get_string(data, '$.commit.collection') = 'app.bsky.feed.post')
|
||||
data.kind = 'commit' AND
|
||||
data.commit.operation = 'create' AND
|
||||
data.commit.collection = 'app.bsky.feed.post'
|
||||
GROUP BY user_id
|
||||
ORDER BY first_post_ts ASC, user_id DESC
|
||||
LIMIT 3";
|
||||
@@ -246,17 +238,17 @@ LIMIT 3";
|
||||
// query 5:
|
||||
let sql = "
|
||||
SELECT
|
||||
json_get_string(data, '$.did') as user_id,
|
||||
data.did::String as user_id,
|
||||
date_part(
|
||||
'epoch',
|
||||
max(to_timestamp_micros(json_get_int(data, '$.time_us'))) -
|
||||
min(to_timestamp_micros(json_get_int(data, '$.time_us')))
|
||||
max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) -
|
||||
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))
|
||||
) AS activity_span
|
||||
FROM bluesky
|
||||
WHERE
|
||||
(json_get_string(data, '$.kind') = 'commit') AND
|
||||
(json_get_string(data, '$.commit.operation') = 'create') AND
|
||||
(json_get_string(data, '$.commit.collection') = 'app.bsky.feed.post')
|
||||
data.kind = 'commit' AND
|
||||
data.commit.operation = 'create' AND
|
||||
data.commit.collection = 'app.bsky.feed.post'
|
||||
GROUP BY user_id
|
||||
ORDER BY activity_span DESC, user_id DESC
|
||||
LIMIT 3";
|
||||
@@ -300,30 +292,21 @@ async fn insert_data_by_sql(frontend: &Arc<Instance>) -> io::Result<()> {
|
||||
async fn desc_table(frontend: &Arc<Instance>) {
|
||||
let sql = "DESC TABLE bluesky";
|
||||
let expected = r#"
|
||||
+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| data | Json<{"_raw":"<String>","commit.collection":"<String>","commit.operation":"<String>","did":"<String>","kind":"<String>","time_us":"<Number>"}> | | YES | | FIELD |
|
||||
| time_us | TimestampMicrosecond | PRI | NO | | TIMESTAMP |
|
||||
+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+"#;
|
||||
+---------+----------------------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+---------+----------------------+-----+------+---------+---------------+
|
||||
| data | JSON2 | | YES | | FIELD |
|
||||
| time_us | TimestampMicrosecond | PRI | NO | | TIMESTAMP |
|
||||
+---------+----------------------+-----+------+---------+---------------+"#;
|
||||
execute_sql_and_expect(frontend, sql, expected).await;
|
||||
}
|
||||
|
||||
async fn create_table(frontend: &Arc<Instance>) {
|
||||
let sql = r#"
|
||||
CREATE TABLE bluesky (
|
||||
"data" JSON (
|
||||
format = "partial",
|
||||
fields = Struct<
|
||||
kind String,
|
||||
"commit.operation" String,
|
||||
"commit.collection" String,
|
||||
did String,
|
||||
time_us Bigint
|
||||
>,
|
||||
),
|
||||
"data" JSON2,
|
||||
time_us TimestampMicrosecond TIME INDEX,
|
||||
)
|
||||
) WITH ('append_mode' = 'true', 'sst_format' = 'flat')
|
||||
"#;
|
||||
execute_sql_and_expect(frontend, sql, "Affected Rows: 0").await;
|
||||
}
|
||||
@@ -1,82 +0,0 @@
|
||||
CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured") DEFAULT '{"foo": "bar"}');
|
||||
|
||||
Error: 1001(Unsupported), Unsupported default constraint for column: 'j', reason: json column cannot have a default value
|
||||
|
||||
CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured"));
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
+--------+----------------------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+--------+----------------------+-----+------+---------+---------------+
|
||||
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
|
||||
| j | Json<"<Null>"> | | YES | | FIELD |
|
||||
+--------+----------------------+-----+------+---------+---------------+
|
||||
|
||||
INSERT INTO t VALUES
|
||||
(1762128001000, '{"int": 1}'),
|
||||
(1762128002000, '{"int": 2, "list": [0.1, 0.2, 0.3]}'),
|
||||
(1762128003000, '{"int": 3, "list": [0.4, 0.5, 0.6], "nested": {"a": {"x": "hello"}, "b": {"y": -1}}}');
|
||||
|
||||
Affected Rows: 3
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
|
||||
| j | Json<{"int":"<Number>","list":["<Number>"],"nested":{"a":{"x":"<String>"},"b":{"y":"<Number>"}}}> | | YES | | FIELD |
|
||||
+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
|
||||
INSERT INTO t VALUES
|
||||
(1762128004000, '{"int": 4, "bool": true, "nested": {"a": {"y": 1}}}'),
|
||||
(1762128005000, '{"int": 5, "bool": false, "nested": {"b": {"x": "world"}}}');
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
|
||||
| j | Json<{"bool":"<Bool>","int":"<Number>","list":["<Number>"],"nested":{"a":{"x":"<String>","y":"<Number>"},"b":{"x":"<String>","y":"<Number>"}}}> | | YES | | FIELD |
|
||||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
|
||||
INSERT INTO t VALUES (1762128006000, '{"int": 6, "list": [-6.0], "bool": true, "nested": {"a": {"x": "ax", "y": 66}, "b": {"y": -66, "x": "bx"}}}');
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
|
||||
| j | Json<{"bool":"<Bool>","int":"<Number>","list":["<Number>"],"nested":{"a":{"x":"<String>","y":"<Number>"},"b":{"x":"<String>","y":"<Number>"}}}> | | YES | | FIELD |
|
||||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
|
||||
|
||||
INSERT INTO t VALUES (1762128011000, '{}');
|
||||
|
||||
Error: 1004(InvalidArguments), Invalid InsertRequest, reason: empty json object is not supported, consider adding a dummy field
|
||||
|
||||
SELECT ts, j FROM t order by ts;
|
||||
|
||||
+---------------------+----------------------------------------------------------------------------------------+
|
||||
| ts | j |
|
||||
+---------------------+----------------------------------------------------------------------------------------+
|
||||
| 2025-11-03T00:00:01 | {bool: , int: 1, list: , nested: } |
|
||||
| 2025-11-03T00:00:02 | {bool: , int: 2, list: [0.1, 0.2, 0.3], nested: } |
|
||||
| 2025-11-03T00:00:03 | {bool: , int: 3, list: [0.4, 0.5, 0.6], nested: {a: {x: hello, y: }, b: {x: , y: -1}}} |
|
||||
| 2025-11-03T00:00:04 | {bool: true, int: 4, list: , nested: {a: {x: , y: 1}, b: }} |
|
||||
| 2025-11-03T00:00:05 | {bool: false, int: 5, list: , nested: {a: , b: {x: world, y: }}} |
|
||||
| 2025-11-03T00:00:06 | {bool: true, int: 6, list: [-6.0], nested: {a: {x: ax, y: 66}, b: {x: bx, y: -66}}} |
|
||||
+---------------------+----------------------------------------------------------------------------------------+
|
||||
|
||||
DROP table t;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured") DEFAULT '{"foo": "bar"}');
|
||||
|
||||
CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured"));
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
INSERT INTO t VALUES
|
||||
(1762128001000, '{"int": 1}'),
|
||||
(1762128002000, '{"int": 2, "list": [0.1, 0.2, 0.3]}'),
|
||||
(1762128003000, '{"int": 3, "list": [0.4, 0.5, 0.6], "nested": {"a": {"x": "hello"}, "b": {"y": -1}}}');
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
INSERT INTO t VALUES
|
||||
(1762128004000, '{"int": 4, "bool": true, "nested": {"a": {"y": 1}}}'),
|
||||
(1762128005000, '{"int": 5, "bool": false, "nested": {"b": {"x": "world"}}}');
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
INSERT INTO t VALUES (1762128006000, '{"int": 6, "list": [-6.0], "bool": true, "nested": {"a": {"x": "ax", "y": 66}, "b": {"y": -66, "x": "bx"}}}');
|
||||
|
||||
DESC TABLE t;
|
||||
|
||||
INSERT INTO t VALUES (1762128011000, '{}');
|
||||
|
||||
SELECT ts, j FROM t order by ts;
|
||||
|
||||
DROP table t;
|
||||
149
tests/cases/standalone/common/types/json/json2.result
Normal file
149
tests/cases/standalone/common/types/json/json2.result
Normal file
@@ -0,0 +1,149 @@
|
||||
create table json2_table (
|
||||
ts timestamp time index,
|
||||
j json2
|
||||
) with (
|
||||
'append_mode' = 'true',
|
||||
'sst_format' = 'flat',
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
insert into json2_table (ts, j)
|
||||
values (1, '{"a": {"b": 1}, "c": "s1"}'),
|
||||
(2, '{"a": {"b": -2}, "c": "s2"}');
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
admin flush_table('json2_table');
|
||||
|
||||
+----------------------------------+
|
||||
| ADMIN flush_table('json2_table') |
|
||||
+----------------------------------+
|
||||
| 0 |
|
||||
+----------------------------------+
|
||||
|
||||
insert into json2_table (ts, j)
|
||||
values (3, '{"a": {"b": 3}, "c": "s3"}');
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
insert into json2_table
|
||||
values (4, '{"a": {"b": -4}}'),
|
||||
(5, '{"a": {}, "c": "s5"}'),
|
||||
(6, '{"c": "s6"}');
|
||||
|
||||
Affected Rows: 3
|
||||
|
||||
admin flush_table('json2_table');
|
||||
|
||||
+----------------------------------+
|
||||
| ADMIN flush_table('json2_table') |
|
||||
+----------------------------------+
|
||||
| 0 |
|
||||
+----------------------------------+
|
||||
|
||||
insert into json2_table
|
||||
values (7, '{"a": {"b": "s7"}, "c": [1]}'),
|
||||
(8, '{"a": {"b": 8}, "c": "s8"}');
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
insert into json2_table
|
||||
values (9, '{"a": {"x": true}, "c": "s9"}'),
|
||||
(10, '{"a": {"b": 10}, "y": false}');
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
select j.a.b from json2_table order by ts;
|
||||
|
||||
+-----------------------------------------------------+
|
||||
| json2_get(json2_table.j,Utf8("a.b"),Utf8View(NULL)) |
|
||||
+-----------------------------------------------------+
|
||||
| 1 |
|
||||
| -2 |
|
||||
| 3 |
|
||||
| -4 |
|
||||
| |
|
||||
| |
|
||||
| s7 |
|
||||
| 8 |
|
||||
| |
|
||||
| 10 |
|
||||
+-----------------------------------------------------+
|
||||
|
||||
select j.a, j.a.x from json2_table order by ts;
|
||||
|
||||
+---------------------------------------------------+-----------------------------------------------------+
|
||||
| json2_get(json2_table.j,Utf8("a"),Utf8View(NULL)) | json2_get(json2_table.j,Utf8("a.x"),Utf8View(NULL)) |
|
||||
+---------------------------------------------------+-----------------------------------------------------+
|
||||
| {b: 1, x: } | |
|
||||
| {b: -2, x: } | |
|
||||
| {b: 3, x: } | |
|
||||
| {b: -4, x: } | |
|
||||
| {b: , x: } | |
|
||||
| | |
|
||||
| {b: s7, x: } | |
|
||||
| {b: 8, x: } | |
|
||||
| {b: , x: true} | true |
|
||||
| {b: 10, x: } | |
|
||||
+---------------------------------------------------+-----------------------------------------------------+
|
||||
|
||||
select j.c, j.y from json2_table order by ts;
|
||||
|
||||
+---------------------------------------------------+---------------------------------------------------+
|
||||
| json2_get(json2_table.j,Utf8("c"),Utf8View(NULL)) | json2_get(json2_table.j,Utf8("y"),Utf8View(NULL)) |
|
||||
+---------------------------------------------------+---------------------------------------------------+
|
||||
| s1 | |
|
||||
| s2 | |
|
||||
| s3 | |
|
||||
| | |
|
||||
| s5 | |
|
||||
| s6 | |
|
||||
| [1] | |
|
||||
| s8 | |
|
||||
| s9 | |
|
||||
| | false |
|
||||
+---------------------------------------------------+---------------------------------------------------+
|
||||
|
||||
select j from json2_table order by ts;
|
||||
|
||||
Error: 3001(EngineExecuteQuery), Invalid argument error: column types must match schema types, expected Struct() but found Struct("a": Struct("b": Utf8, "x": Boolean), "c": Utf8, "y": Boolean) at column index 0
|
||||
|
||||
select * from json2_table order by ts;
|
||||
|
||||
Error: 3001(EngineExecuteQuery), Invalid argument error: column types must match schema types, expected Struct() but found Struct("a": Struct("b": Utf8, "x": Boolean), "c": Utf8, "y": Boolean) at column index 1
|
||||
|
||||
select j.a.b + 1 from json2_table order by ts;
|
||||
|
||||
+-------------------------------------------------------------+
|
||||
| json2_get(json2_table.j,Utf8("a.b"),Int64(NULL)) + Int64(1) |
|
||||
+-------------------------------------------------------------+
|
||||
| 2 |
|
||||
| -1 |
|
||||
| 4 |
|
||||
| -3 |
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
| 9 |
|
||||
| |
|
||||
| 11 |
|
||||
+-------------------------------------------------------------+
|
||||
|
||||
select abs(j.a.b) from json2_table order by ts;
|
||||
|
||||
Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String No function matches the given name and argument types 'abs(Utf8View)'. You might need to add explicit type casts.
|
||||
Candidate functions:
|
||||
abs(Numeric(1))
|
||||
|
||||
-- "j.c" is of type "String", "abs" is expected to be all "null"s.
|
||||
select abs(j.c) from json2_table order by ts;
|
||||
|
||||
Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String No function matches the given name and argument types 'abs(Utf8View)'. You might need to add explicit type casts.
|
||||
Candidate functions:
|
||||
abs(Numeric(1))
|
||||
|
||||
drop table json2_table;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
50
tests/cases/standalone/common/types/json/json2.sql
Normal file
50
tests/cases/standalone/common/types/json/json2.sql
Normal file
@@ -0,0 +1,50 @@
|
||||
create table json2_table (
|
||||
ts timestamp time index,
|
||||
j json2
|
||||
) with (
|
||||
'append_mode' = 'true',
|
||||
'sst_format' = 'flat',
|
||||
);
|
||||
|
||||
insert into json2_table (ts, j)
|
||||
values (1, '{"a": {"b": 1}, "c": "s1"}'),
|
||||
(2, '{"a": {"b": -2}, "c": "s2"}');
|
||||
|
||||
admin flush_table('json2_table');
|
||||
|
||||
insert into json2_table (ts, j)
|
||||
values (3, '{"a": {"b": 3}, "c": "s3"}');
|
||||
|
||||
insert into json2_table
|
||||
values (4, '{"a": {"b": -4}}'),
|
||||
(5, '{"a": {}, "c": "s5"}'),
|
||||
(6, '{"c": "s6"}');
|
||||
|
||||
admin flush_table('json2_table');
|
||||
|
||||
insert into json2_table
|
||||
values (7, '{"a": {"b": "s7"}, "c": [1]}'),
|
||||
(8, '{"a": {"b": 8}, "c": "s8"}');
|
||||
|
||||
insert into json2_table
|
||||
values (9, '{"a": {"x": true}, "c": "s9"}'),
|
||||
(10, '{"a": {"b": 10}, "y": false}');
|
||||
|
||||
select j.a.b from json2_table order by ts;
|
||||
|
||||
select j.a, j.a.x from json2_table order by ts;
|
||||
|
||||
select j.c, j.y from json2_table order by ts;
|
||||
|
||||
select j from json2_table order by ts;
|
||||
|
||||
select * from json2_table order by ts;
|
||||
|
||||
select j.a.b + 1 from json2_table order by ts;
|
||||
|
||||
select abs(j.a.b) from json2_table order by ts;
|
||||
|
||||
-- "j.c" is of type "String", "abs" is expected to be all "null"s.
|
||||
select abs(j.c) from json2_table order by ts;
|
||||
|
||||
drop table json2_table;
|
||||
176
tests/cases/standalone/common/types/json/jsonbench.result
Normal file
176
tests/cases/standalone/common/types/json/jsonbench.result
Normal file
@@ -0,0 +1,176 @@
|
||||
CREATE TABLE bluesky (
|
||||
`data` JSON2,
|
||||
time_us TimestampMicrosecond TIME INDEX
|
||||
) WITH ('append_mode' = 'true', 'sst_format' = 'flat');
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349000167,
|
||||
'{"did":"did:plc:yj3sjq3blzpynh27cumnp5ks","time_us":1732206349000167,"kind":"commit","commit":{"rev":"3lbhtytnn2k2f","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtyteurk2y","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.095Z","langs":["en"],"reply":{"parent":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"},"root":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"}},"text":"aaaaah. LIght shines in a corner of WTF...."},"cid":"bafyreidblutgvj75o4q4akzyyejedjj6l3it6hgqwee6jpwv2wqph5fsgm"}}');
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349000644,
|
||||
'{"did":"did:plc:3i4xf2v4wcnyktgv6satke64","time_us":1732206349000644,"kind":"commit","commit":{"rev":"3lbhuvzds6d2a","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhuvzdked2a","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.221Z","subject":{"cid":"bafyreidjvrcmckkm765mct5fph36x7kupkfo35rjklbf2k76xkzwyiauge","uri":"at://did:plc:azrv4rcbws6kmcga4fsbphg2/app.bsky.feed.post/3lbgjdpbiec2l"}},"cid":"bafyreia5l5vrkh5oj4cjyhcqby2dprhyvcyofo2q5562tijlae2pzih23m"}}');
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
ADMIN flush_table('bluesky');
|
||||
|
||||
+------------------------------+
|
||||
| ADMIN flush_table('bluesky') |
|
||||
+------------------------------+
|
||||
| 0 |
|
||||
+------------------------------+
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349001108,
|
||||
'{"did":"did:plc:gccfnqqizz4urhchsaie6jft","time_us":1732206349001108,"kind":"commit","commit":{"rev":"3lbhuvze3gi2u","operation":"create","collection":"app.bsky.graph.follow","rkey":"3lbhuvzdtmi2u","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:27:40.923Z","subject":"did:plc:r7cdh4sgzqbfdc6wcdxxti7c"},"cid":"bafyreiew2p6cgirfaj45qoenm4fgumib7xoloclrap3jgkz5es7g7kby3i"}}');
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349001372,
|
||||
'{"did":"did:plc:msxqf3twq7abtdw7dbfskphk","time_us":1732206349001372,"kind":"commit","commit":{"rev":"3lbhueija5p22","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhueiizcx22","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:15:58.232Z","subject":{"cid":"bafyreiavpshyqzrlo5m7fqodjhs6jevweqnif4phasiwimv4a7mnsqi2fe","uri":"at://did:plc:fusulxqc52zbrc75fi6xrcof/app.bsky.feed.post/3lbhskq5zn22f"}},"cid":"bafyreidjix4dauj2afjlbzmhj3a7gwftcevvmmy6edww6vrjdbst26rkby"}}');
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
ADMIN flush_table('bluesky');
|
||||
|
||||
+------------------------------+
|
||||
| ADMIN flush_table('bluesky') |
|
||||
+------------------------------+
|
||||
| 0 |
|
||||
+------------------------------+
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349001905,
|
||||
'{"did":"did:plc:l5o3qjrmfztir54cpwlv2eme","time_us":1732206349001905,"kind":"commit","commit":{"rev":"3lbhtytohxc2o","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtytjqzk2q","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.254Z","langs":["en"],"reply":{"parent":{"cid":"bafyreih35fe2jj3gchmgk4amold4l6sfxd2sby5wrg3jrws5fkdypxrbg4","uri":"at://did:plc:6wx2gg5yqgvmlu35r6y3bk6d/app.bsky.feed.post/3lbhtj2eb4s2o"},"root":{"cid":"bafyreifipyt3vctd4ptuoicvio7rbr5xvjv4afwuggnd2prnmn55mu6luu","uri":"at://did:plc:474ldquxwzrlcvjhhbbk2wte/app.bsky.feed.post/3lbhdzrynik27"}},"text":"okay i take mine back because I hadn’t heard this one yet^^"},"cid":"bafyreigzdsdne3z2xxcakgisieyj7y47hj6eg7lj6v4q25ah5q2qotu5ku"}}');
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
ADMIN compact_table('bluesky', 'swcs', '86400');
|
||||
|
||||
+-------------------------------------------------+
|
||||
| ADMIN compact_table('bluesky', 'swcs', '86400') |
|
||||
+-------------------------------------------------+
|
||||
| 0 |
|
||||
+-------------------------------------------------+
|
||||
|
||||
SELECT count(*) FROM bluesky;
|
||||
|
||||
+----------+
|
||||
| count(*) |
|
||||
+----------+
|
||||
| 5 |
|
||||
+----------+
|
||||
|
||||
-- Query 1:
|
||||
SELECT data.commit.collection AS event,
|
||||
count() AS count
|
||||
FROM bluesky
|
||||
GROUP BY event
|
||||
ORDER BY count DESC, event ASC;
|
||||
|
||||
+-----------------------+-------+
|
||||
| event | count |
|
||||
+-----------------------+-------+
|
||||
| app.bsky.feed.like | 2 |
|
||||
| app.bsky.feed.post | 2 |
|
||||
| app.bsky.graph.follow | 1 |
|
||||
+-----------------------+-------+
|
||||
|
||||
-- Query 2:
|
||||
SELECT data.commit.collection AS event,
|
||||
count() AS count,
|
||||
count(DISTINCT data.did) AS users
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit' AND data.commit.operation = 'create'
|
||||
GROUP BY event
|
||||
ORDER BY count DESC, event ASC;
|
||||
|
||||
+-----------------------+-------+-------+
|
||||
| event | count | users |
|
||||
+-----------------------+-------+-------+
|
||||
| app.bsky.feed.like | 2 | 2 |
|
||||
| app.bsky.feed.post | 2 | 2 |
|
||||
| app.bsky.graph.follow | 1 | 1 |
|
||||
+-----------------------+-------+-------+
|
||||
|
||||
-- Query 3:
|
||||
SELECT data.commit.collection AS event,
|
||||
date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day,
|
||||
count() AS count
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit'
|
||||
AND data.commit.operation = 'create'
|
||||
AND data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
|
||||
GROUP BY event, hour_of_day
|
||||
ORDER BY hour_of_day, event;
|
||||
|
||||
+--------------------+-------------+-------+
|
||||
| event | hour_of_day | count |
|
||||
+--------------------+-------------+-------+
|
||||
| app.bsky.feed.like | 16 | 2 |
|
||||
| app.bsky.feed.post | 16 | 2 |
|
||||
+--------------------+-------------+-------+
|
||||
|
||||
-- Query 4:
|
||||
SELECT data.did::String as user_id,
|
||||
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit'
|
||||
AND data.commit.operation = 'create'
|
||||
AND data.commit.collection = 'app.bsky.feed.post'
|
||||
GROUP BY user_id
|
||||
ORDER BY first_post_ts ASC, user_id DESC
|
||||
LIMIT 3;
|
||||
|
||||
+----------------------------------+----------------------------+
|
||||
| user_id | first_post_ts |
|
||||
+----------------------------------+----------------------------+
|
||||
| did:plc:yj3sjq3blzpynh27cumnp5ks | 2024-11-21T16:25:49.000167 |
|
||||
| did:plc:l5o3qjrmfztir54cpwlv2eme | 2024-11-21T16:25:49.001905 |
|
||||
+----------------------------------+----------------------------+
|
||||
|
||||
-- Query 5:
|
||||
SELECT data.did::String as user_id,
|
||||
date_part(
|
||||
'epoch',
|
||||
max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) -
|
||||
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))
|
||||
) AS activity_span
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit'
|
||||
AND data.commit.operation = 'create'
|
||||
AND data.commit.collection = 'app.bsky.feed.post'
|
||||
GROUP BY user_id
|
||||
ORDER BY activity_span DESC, user_id DESC
|
||||
LIMIT 3;
|
||||
|
||||
+----------------------------------+---------------+
|
||||
| user_id | activity_span |
|
||||
+----------------------------------+---------------+
|
||||
| did:plc:yj3sjq3blzpynh27cumnp5ks | 0.0 |
|
||||
| did:plc:l5o3qjrmfztir54cpwlv2eme | 0.0 |
|
||||
+----------------------------------+---------------+
|
||||
|
||||
-- SQLNESS REPLACE (peers.*) REDACTED
|
||||
EXPLAIN
|
||||
SELECT date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day
|
||||
FROM bluesky;
|
||||
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
|
||||
| | Projection: date_part(Utf8("hour"), to_timestamp_micros(json2_get(bluesky.data, Utf8("time_us"), Int64(NULL)))) AS hour_of_day |
|
||||
| | TableScan: bluesky |
|
||||
| | ]] |
|
||||
| physical_plan | CooperativeExec |
|
||||
| | MergeScanExec: REDACTED
|
||||
| | |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
90
tests/cases/standalone/common/types/json/jsonbench.sql
Normal file
90
tests/cases/standalone/common/types/json/jsonbench.sql
Normal file
@@ -0,0 +1,90 @@
|
||||
CREATE TABLE bluesky (
|
||||
`data` JSON2,
|
||||
time_us TimestampMicrosecond TIME INDEX
|
||||
) WITH ('append_mode' = 'true', 'sst_format' = 'flat');
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349000167,
|
||||
'{"did":"did:plc:yj3sjq3blzpynh27cumnp5ks","time_us":1732206349000167,"kind":"commit","commit":{"rev":"3lbhtytnn2k2f","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtyteurk2y","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.095Z","langs":["en"],"reply":{"parent":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"},"root":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"}},"text":"aaaaah. LIght shines in a corner of WTF...."},"cid":"bafyreidblutgvj75o4q4akzyyejedjj6l3it6hgqwee6jpwv2wqph5fsgm"}}');
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349000644,
|
||||
'{"did":"did:plc:3i4xf2v4wcnyktgv6satke64","time_us":1732206349000644,"kind":"commit","commit":{"rev":"3lbhuvzds6d2a","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhuvzdked2a","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.221Z","subject":{"cid":"bafyreidjvrcmckkm765mct5fph36x7kupkfo35rjklbf2k76xkzwyiauge","uri":"at://did:plc:azrv4rcbws6kmcga4fsbphg2/app.bsky.feed.post/3lbgjdpbiec2l"}},"cid":"bafyreia5l5vrkh5oj4cjyhcqby2dprhyvcyofo2q5562tijlae2pzih23m"}}');
|
||||
|
||||
ADMIN flush_table('bluesky');
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349001108,
|
||||
'{"did":"did:plc:gccfnqqizz4urhchsaie6jft","time_us":1732206349001108,"kind":"commit","commit":{"rev":"3lbhuvze3gi2u","operation":"create","collection":"app.bsky.graph.follow","rkey":"3lbhuvzdtmi2u","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:27:40.923Z","subject":"did:plc:r7cdh4sgzqbfdc6wcdxxti7c"},"cid":"bafyreiew2p6cgirfaj45qoenm4fgumib7xoloclrap3jgkz5es7g7kby3i"}}');
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349001372,
|
||||
'{"did":"did:plc:msxqf3twq7abtdw7dbfskphk","time_us":1732206349001372,"kind":"commit","commit":{"rev":"3lbhueija5p22","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhueiizcx22","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:15:58.232Z","subject":{"cid":"bafyreiavpshyqzrlo5m7fqodjhs6jevweqnif4phasiwimv4a7mnsqi2fe","uri":"at://did:plc:fusulxqc52zbrc75fi6xrcof/app.bsky.feed.post/3lbhskq5zn22f"}},"cid":"bafyreidjix4dauj2afjlbzmhj3a7gwftcevvmmy6edww6vrjdbst26rkby"}}');
|
||||
|
||||
ADMIN flush_table('bluesky');
|
||||
|
||||
INSERT INTO bluesky (time_us, data)
|
||||
VALUES (1732206349001905,
|
||||
'{"did":"did:plc:l5o3qjrmfztir54cpwlv2eme","time_us":1732206349001905,"kind":"commit","commit":{"rev":"3lbhtytohxc2o","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtytjqzk2q","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.254Z","langs":["en"],"reply":{"parent":{"cid":"bafyreih35fe2jj3gchmgk4amold4l6sfxd2sby5wrg3jrws5fkdypxrbg4","uri":"at://did:plc:6wx2gg5yqgvmlu35r6y3bk6d/app.bsky.feed.post/3lbhtj2eb4s2o"},"root":{"cid":"bafyreifipyt3vctd4ptuoicvio7rbr5xvjv4afwuggnd2prnmn55mu6luu","uri":"at://did:plc:474ldquxwzrlcvjhhbbk2wte/app.bsky.feed.post/3lbhdzrynik27"}},"text":"okay i take mine back because I hadn’t heard this one yet^^"},"cid":"bafyreigzdsdne3z2xxcakgisieyj7y47hj6eg7lj6v4q25ah5q2qotu5ku"}}');
|
||||
|
||||
ADMIN compact_table('bluesky', 'swcs', '86400');
|
||||
|
||||
SELECT count(*) FROM bluesky;
|
||||
|
||||
-- Query 1:
|
||||
SELECT data.commit.collection AS event,
|
||||
count() AS count
|
||||
FROM bluesky
|
||||
GROUP BY event
|
||||
ORDER BY count DESC, event ASC;
|
||||
|
||||
-- Query 2:
|
||||
SELECT data.commit.collection AS event,
|
||||
count() AS count,
|
||||
count(DISTINCT data.did) AS users
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit' AND data.commit.operation = 'create'
|
||||
GROUP BY event
|
||||
ORDER BY count DESC, event ASC;
|
||||
|
||||
-- Query 3:
|
||||
SELECT data.commit.collection AS event,
|
||||
date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day,
|
||||
count() AS count
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit'
|
||||
AND data.commit.operation = 'create'
|
||||
AND data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
|
||||
GROUP BY event, hour_of_day
|
||||
ORDER BY hour_of_day, event;
|
||||
|
||||
-- Query 4:
|
||||
SELECT data.did::String as user_id,
|
||||
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit'
|
||||
AND data.commit.operation = 'create'
|
||||
AND data.commit.collection = 'app.bsky.feed.post'
|
||||
GROUP BY user_id
|
||||
ORDER BY first_post_ts ASC, user_id DESC
|
||||
LIMIT 3;
|
||||
|
||||
-- Query 5:
|
||||
SELECT data.did::String as user_id,
|
||||
date_part(
|
||||
'epoch',
|
||||
max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) -
|
||||
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))
|
||||
) AS activity_span
|
||||
FROM bluesky
|
||||
WHERE data.kind = 'commit'
|
||||
AND data.commit.operation = 'create'
|
||||
AND data.commit.collection = 'app.bsky.feed.post'
|
||||
GROUP BY user_id
|
||||
ORDER BY activity_span DESC, user_id DESC
|
||||
LIMIT 3;
|
||||
|
||||
-- SQLNESS REPLACE (peers.*) REDACTED
|
||||
EXPLAIN
|
||||
SELECT date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day
|
||||
FROM bluesky;
|
||||
Reference in New Issue
Block a user