From 869a584f8af7dd2d9f5f9ba5ff86f7a29857b6e7 Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 29 May 2026 15:07:49 +0800 Subject: [PATCH] ci: add nightly jsonbench test (#7750) Signed-off-by: luofucong --- .github/workflows/nightly-jsonbench.yaml | 162 ++++++++++++++++++ src/datatypes/src/json.rs | 85 +++++---- src/datatypes/src/json/value.rs | 108 +++++++----- src/datatypes/src/types/json_type.rs | 10 +- src/datatypes/src/vectors/json/builder.rs | 4 +- .../standalone/common/types/json/json2.result | 23 ++- .../standalone/common/types/json/json2.sql | 9 + 7 files changed, 315 insertions(+), 86 deletions(-) create mode 100644 .github/workflows/nightly-jsonbench.yaml diff --git a/.github/workflows/nightly-jsonbench.yaml b/.github/workflows/nightly-jsonbench.yaml new file mode 100644 index 0000000000..3667ee26a6 --- /dev/null +++ b/.github/workflows/nightly-jsonbench.yaml @@ -0,0 +1,162 @@ +name: Nightly JSONBench + +on: + schedule: + # Trigger at 00:00(Asia/Shanghai) on every weekday. + - cron: "0 16 * * 0-4" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + allocate-runner: + name: Allocate runner + if: ${{ github.repository == 'GreptimeTeam/greptimedb' }} + runs-on: ubuntu-latest + outputs: + linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }} + + # The following EC2 resource id will be used for resource releasing. 
+ linux-arm64-ec2-runner-label: ${{ steps.start-linux-arm64-runner.outputs.label }} + linux-arm64-ec2-runner-instance-id: ${{ steps.start-linux-arm64-runner.outputs.ec2-instance-id }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Allocate Linux ARM64 runner + uses: ./.github/actions/start-runner + id: start-linux-arm64-runner + with: + runner: ${{ vars.DEFAULT_ARM64_RUNNER }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.EC2_RUNNER_REGION }} + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + image-id: ${{ vars.EC2_RUNNER_LINUX_ARM64_IMAGE_ID }} + security-group-id: ${{ vars.EC2_RUNNER_SECURITY_GROUP_ID }} + subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }} + + jsonbench: + name: Run JSONBench + if: ${{ github.repository == 'GreptimeTeam/greptimedb' }} + needs: [ allocate-runner ] + runs-on: ${{ needs.allocate-runner.outputs.linux-arm64-runner }} + timeout-minutes: 120 + env: + JSONBENCH_DATA_DIR: /home/runner/data/bluesky + JSONBENCH_OUTPUT_PREFIX: _ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - uses: arduino/setup-protoc@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + + - name: Rust Cache + uses: Swatinem/rust-cache@v2 + with: + shared-key: "nightly-jsonbench" + cache-all-crates: "true" + save-if: ${{ github.ref == 'refs/heads/main' }} + + - name: Build GreptimeDB + run: cargo build --profile nightly --bin greptime + + - name: Reclaim disk space + shell: bash + run: | + set -euo pipefail + + mkdir -p "${RUNNER_TEMP}/greptimedb-bin" + cp ./target/nightly/greptime "${RUNNER_TEMP}/greptimedb-bin/greptime" + chmod +x "${RUNNER_TEMP}/greptimedb-bin/greptime" + + rm -rf ./target + + - name: Run JSONBench + shell: bash + run: | + set -euo pipefail + + cd 
"${RUNNER_TEMP}" + cp "${RUNNER_TEMP}/greptimedb-bin/greptime" ./greptime + chmod +x ./greptime + + export GREPTIMEDB_STANDALONE__WAL__DIR=greptimedb_data/wal + export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME=greptimedb_data + export GREPTIMEDB_STANDALONE__LOGGING__DIR=greptimedb_data/logs + export GREPTIMEDB_STANDALONE__LOGGING__APPEND_STDOUT=false + export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB + export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s + + ./greptime standalone start > greptimedb.log 2>&1 & + greptime_pid=$! + trap 'kill "${greptime_pid}" 2>/dev/null || true' EXIT + + until curl -s --fail -o /dev/null http://localhost:4000/health; do + if ! kill -0 "${greptime_pid}" 2>/dev/null; then + cat greptimedb.log + exit 1 + fi + sleep 1 + done + + git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench + cp ./greptime JSONBench/greptimedb/greptime + + cd JSONBench/greptimedb + ./main.sh 3 "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false + + - name: Upload JSONBench results + if: always() + uses: actions/upload-artifact@v4 + with: + name: jsonbench-results + path: | + ${{ runner.temp }}/greptimedb.log + ${{ runner.temp }}/JSONBench/greptimedb/*.log + ${{ runner.temp }}/JSONBench/greptimedb/*.total_size + ${{ runner.temp }}/JSONBench/greptimedb/*.data_size + ${{ runner.temp }}/JSONBench/greptimedb/*.index_size + ${{ runner.temp }}/JSONBench/greptimedb/*.count + ${{ runner.temp }}/JSONBench/greptimedb/*.results_runtime + ${{ runner.temp }}/JSONBench/greptimedb/*.query_results + if-no-files-found: ignore + retention-days: 7 + + stop-linux-arm64-runner: + name: Stop Linux ARM64 runner + # It's always run as the last job in the workflow to make sure that the runner is released. 
+ if: ${{ always() }} + runs-on: ubuntu-latest + needs: [ + allocate-runner, + jsonbench, + ] + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Stop Linux ARM64 runner + uses: ./.github/actions/stop-runner + with: + label: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-label }} + ec2-instance-id: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-instance-id }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.EC2_RUNNER_REGION }} + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs index db657abbcb..33104084ad 100644 --- a/src/datatypes/src/json.rs +++ b/src/datatypes/src/json.rs @@ -26,12 +26,12 @@ use std::sync::Arc; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value as Json}; -use snafu::{OptionExt, ResultExt, ensure}; +use snafu::{OptionExt, ResultExt}; use crate::error::{self, InvalidJsonSnafu, Result, SerializeSnafu}; use crate::json::value::{JsonValue, JsonVariant}; use crate::types::json_type::{JsonNativeType, JsonNumberType, JsonObjectType}; -use crate::types::{StructField, StructType}; +use crate::types::{JsonType, StructField, StructType}; use crate::value::{ListValue, StructValue, Value}; /// The configuration of JSON encoding @@ -305,33 +305,47 @@ fn encode_json_array_with_context<'a>( ) -> Result { let json_array_len = json_array.len(); let mut items = Vec::with_capacity(json_array_len); - let mut element_type = item_type.cloned(); for (index, value) in json_array.into_iter().enumerate() { let array_context = context.with_key(&index.to_string()); - let item_value = - encode_json_value_with_context(value, element_type.as_ref(), &array_context)?; - let item_type = item_value.json_type().native_type().clone(); - items.push(item_value.into_variant()); - - // Determine the common type for the list - 
if let Some(current_type) = &element_type { - // It's valid for json array to have different types of items, for example, - // ["a string", 1]. However, the `JsonValue` will be converted to Arrow list array, - // which requires all items have exactly same type. So we forbid the different types - // case here. Besides, it's not common for items in a json array to differ. So I think - // we are good here. - ensure!( - item_type == *current_type, - error::InvalidJsonSnafu { - value: "all items in json array must have the same type" - } - ); - } else { - element_type = Some(item_type); - } + let item_value = encode_json_value_with_context(value, None, &array_context)?; + items.push(item_value); } + // Per the JSON specification, an array may hold items of different types, for example, + // ["a string", 1]. However, in this implementation, the `JsonValue` is converted to an Arrow + // list array, which requires all items to have exactly the same type. So we merge the possibly + // different item types into one unified type, and align every item value to it. 
+ + let provided_item_type = item_type.map(|x| JsonType::new_json2(x.clone())); + let merged_item_type = if let Some((first, rests)) = items.split_first() { + let mut merged = first.json_type().clone(); + for rest in rests.iter().map(|x| x.json_type()) { + if matches!(merged.native_type(), JsonNativeType::Variant) { + break; + } + merged.merge(rest)?; + } + Some(merged) + } else { + None + }; + let unified_item_type = match (provided_item_type, merged_item_type) { + (Some(mut x), Some(y)) => { + x.merge(&y)?; + Some(x) + } + (x, y) => x.or(y), + }; + if let Some(unified_item_type) = unified_item_type { + for item in &mut items { + item.try_align(&unified_item_type)?; + } + } + let items = items + .into_iter() + .map(|x| x.into_variant()) + .collect::>(); Ok(JsonValue::new(JsonVariant::Array(items))) } @@ -1050,11 +1064,8 @@ mod tests { fn test_encode_json_array_mixed_types() { let json = json!([1, "hello", true, 3.15]); let settings = JsonStructureSettings::Structured(None); - let result = settings.encode_with_type(json, None); - assert_eq!( - result.unwrap_err().to_string(), - "Invalid JSON: all items in json array must have the same type" - ); + let value = settings.encode_with_type(json, None).unwrap(); + assert_eq!(value.data_type().to_string(), r#"Json2[""]"#); } #[test] @@ -1276,12 +1287,12 @@ mod tests { #[test] fn test_encode_json_array_with_item_type() { let json = json!([1, 2, 3]); - let item_type = Arc::new(ConcreteDataType::uint64_datatype()); + let item_type = Arc::new(ConcreteDataType::int64_datatype()); let settings = JsonStructureSettings::Structured(None); let result = settings .encode_with_type( json, - Some(&JsonNativeType::Array(Box::new(JsonNativeType::u64()))), + Some(&JsonNativeType::Array(Box::new(JsonNativeType::i64()))), ) .unwrap() .into_json_inner() @@ -1289,9 +1300,9 @@ mod tests { if let Value::List(list_value) = result { assert_eq!(list_value.items().len(), 3); - assert_eq!(list_value.items()[0], Value::UInt64(1)); - 
assert_eq!(list_value.items()[1], Value::UInt64(2)); - assert_eq!(list_value.items()[2], Value::UInt64(3)); + assert_eq!(list_value.items()[0], Value::Int64(1)); + assert_eq!(list_value.items()[1], Value::Int64(2)); + assert_eq!(list_value.items()[2], Value::Int64(3)); assert_eq!(list_value.datatype(), item_type); } else { panic!("Expected List value"); @@ -2249,10 +2260,10 @@ mod tests { )])), ); - let decoded_struct = settings.decode_struct(array_struct); + let decoded_struct = settings.decode_struct(array_struct).unwrap(); assert_eq!( - decoded_struct.unwrap_err().to_string(), - "Invalid JSON: all items in json array must have the same type" + format!("{decoded_struct:?}"), + r#"StructValue { items: [List(ListValue { items: [Binary(Bytes(b"1")), Binary(Bytes(b"\"hello\"")), Binary(Bytes(b"true")), Binary(Bytes(b"3.15"))], datatype: Binary(BinaryType { repr_type: Binary }) })], fields: StructType { fields: [StructField { name: "value", data_type: List(ListType { item_type: Binary(BinaryType { repr_type: Binary }) }), nullable: true, metadata: {} }] } }"# ); } diff --git a/src/datatypes/src/json/value.rs b/src/datatypes/src/json/value.rs index f3b652a549..4350630003 100644 --- a/src/datatypes/src/json/value.rs +++ b/src/datatypes/src/json/value.rs @@ -65,6 +65,14 @@ impl JsonNumber { JsonNumber::Float(n) => n.0, } } + + fn native_type(&self) -> JsonNativeType { + match self { + JsonNumber::PosInt(_) => JsonNativeType::u64(), + JsonNumber::NegInt(_) => JsonNativeType::i64(), + JsonNumber::Float(_) => JsonNativeType::f64(), + } + } } impl From for JsonNumber { @@ -147,26 +155,14 @@ impl JsonVariant { match self { JsonVariant::Null => JsonNativeType::Null, JsonVariant::Bool(_) => JsonNativeType::Bool, - JsonVariant::Number(n) => match n { - JsonNumber::PosInt(_) => JsonNativeType::u64(), - JsonNumber::NegInt(_) => JsonNativeType::i64(), - JsonNumber::Float(_) => JsonNativeType::f64(), - }, + JsonVariant::Number(n) => n.native_type(), JsonVariant::String(_) => 
JsonNativeType::String, JsonVariant::Array(array) => { - let item_type = if let Some(first) = array.first() { - first.native_type() - } else { - JsonNativeType::Null - }; - JsonNativeType::Array(Box::new(item_type)) + json_array_native_type(array.iter().map(JsonVariant::native_type)) + } + JsonVariant::Object(object) => { + json_object_native_type(object.iter().map(|(k, v)| (k, v.native_type()))) } - JsonVariant::Object(object) => JsonNativeType::Object( - object - .iter() - .map(|(k, v)| (k.clone(), v.native_type())) - .collect(), - ), JsonVariant::Variant(_) => JsonNativeType::Variant, } } @@ -469,6 +465,7 @@ impl JsonValue { .collect::>()?, ), + (JsonVariant::Object(kvs), _) if kvs.is_empty() => JsonVariant::Null, (JsonVariant::Object(mut kvs), JsonNativeType::Object(expected)) => { ensure!( expected.keys().len() >= kvs.keys().len() @@ -517,7 +514,7 @@ impl JsonValue { let x = std::mem::take(&mut self.json_variant); self.json_variant = helper(x, expected.native_type())?; - self.json_type = OnceLock::from(expected.clone()); + self.json_type = OnceLock::new(); Ok(()) } } @@ -623,35 +620,55 @@ pub enum JsonVariantRef<'a> { } impl JsonVariantRef<'_> { - fn json_type(&self) -> JsonType { - fn native_type(v: &JsonVariantRef<'_>) -> JsonNativeType { - match v { - JsonVariantRef::Null => JsonNativeType::Null, - JsonVariantRef::Bool(_) => JsonNativeType::Bool, - JsonVariantRef::Number(n) => match n { - JsonNumber::PosInt(_) => JsonNativeType::u64(), - JsonNumber::NegInt(_) => JsonNativeType::i64(), - JsonNumber::Float(_) => JsonNativeType::f64(), - }, - JsonVariantRef::String(_) => JsonNativeType::String, - JsonVariantRef::Array(array) => { - let item_type = if let Some(first) = array.first() { - native_type(first) - } else { - JsonNativeType::Null - }; - JsonNativeType::Array(Box::new(item_type)) - } - JsonVariantRef::Object(object) => JsonNativeType::Object( - object - .iter() - .map(|(k, v)| (k.to_string(), native_type(v))) - .collect(), - ), - 
JsonVariantRef::Variant(_) => JsonNativeType::Variant, + fn native_type(&self) -> JsonNativeType { + match self { + JsonVariantRef::Null => JsonNativeType::Null, + JsonVariantRef::Bool(_) => JsonNativeType::Bool, + JsonVariantRef::Number(n) => n.native_type(), + JsonVariantRef::String(_) => JsonNativeType::String, + JsonVariantRef::Array(array) => { + json_array_native_type(array.iter().map(JsonVariantRef::native_type)) } + JsonVariantRef::Object(object) => { + json_object_native_type(object.iter().map(|(k, v)| (*k, v.native_type()))) + } + JsonVariantRef::Variant(_) => JsonNativeType::Variant, } - JsonType::new_json2(native_type(self)) + } + + fn json_type(&self) -> JsonType { + JsonType::new_json2(self.native_type()) + } +} + +fn json_array_native_type(items: I) -> JsonNativeType +where + I: IntoIterator, +{ + let mut iter = items.into_iter(); + let mut item_type = match iter.next() { + Some(t) => t, + None => return JsonNativeType::Array(Box::new(JsonNativeType::Null)), + }; + for x in iter { + if matches!(item_type, JsonNativeType::Variant) { + break; + } + item_type.merge(&x); + } + JsonNativeType::Array(Box::new(item_type)) +} + +fn json_object_native_type(fields: I) -> JsonNativeType +where + I: IntoIterator, + K: Into, +{ + let mut fields = fields.into_iter().peekable(); + if fields.peek().is_none() { + JsonNativeType::Null + } else { + JsonNativeType::Object(fields.map(|(k, v)| (k.into(), v)).collect()) } } @@ -941,7 +958,6 @@ mod tests { ("name".to_string(), JsonVariant::Null), ]))) ); - assert_eq!(value.json_type(), &expected); // Object alignment should fail if the expected type misses any field from the value. 
let expected = JsonType::new_json2(JsonNativeType::Object(JsonObjectType::from([( diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index e8d06543ed..652847da43 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -115,6 +115,14 @@ impl JsonNativeType { (JsonNativeType::Null, that) => that.clone(), (this, JsonNativeType::Null) => this, (this, that) if this == *that => this, + + (JsonNativeType::Number(x), JsonNativeType::Number(y)) => { + JsonNativeType::Number(match (x, y) { + (x, y) if x == *y => x, + (JsonNumberType::F64, _) | (_, JsonNumberType::F64) => JsonNumberType::F64, + _ => JsonNumberType::I64, + }) + } _ => JsonNativeType::Variant, }; } @@ -822,7 +830,7 @@ mod tests { test( "1.5", &mut JsonType::new_json2(JsonNativeType::i64()), - Ok(r#""""#), + Ok(r#""""#), )?; // Object merge should preserve existing fields and append missing fields. diff --git a/src/datatypes/src/vectors/json/builder.rs b/src/datatypes/src/vectors/json/builder.rs index be79a921c7..7ca1ff2f6a 100644 --- a/src/datatypes/src/vectors/json/builder.rs +++ b/src/datatypes/src/vectors/json/builder.rs @@ -89,7 +89,9 @@ impl MutableVector for JsonVectorBuilder { .fail(); }; let json_type = value.json_type(); - self.merged_type.merge(json_type)?; + if !self.merged_type.is_include(json_type) { + self.merged_type.merge(json_type)?; + } let value = JsonValue::new(JsonVariant::from(value.variant().clone())); self.values.push(value); diff --git a/tests/cases/standalone/common/types/json/json2.result b/tests/cases/standalone/common/types/json/json2.result index 71e119307c..7de73f2a78 100644 --- a/tests/cases/standalone/common/types/json/json2.result +++ b/tests/cases/standalone/common/types/json/json2.result @@ -126,7 +126,7 @@ select j.a, j.a.x from json2_table order by ts; | {"b":-2} | | | {"b":3} | | | {"b":-4} | | -| {"b":null} | | +| | | | | | | {"b":"s7"} | | | {"b":8} | | @@ -151,6 +151,14 @@ select j.c, j.y from 
json2_table order by ts; | | false | +-----------------------------------+-----------------------------------+ +select j from json2_table order by ts; + +Error: 3001(EngineExecuteQuery), Failed to align JSON array, reason: Invalid argument error: use StructArray::try_new_with_length or StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly + +select * from json2_table order by ts; + +Error: 3001(EngineExecuteQuery), Failed to align JSON array, reason: Invalid argument error: use StructArray::try_new_with_length or StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly + select j.a.b + 1 from json2_table order by ts; +------------------------------------------------------------+ @@ -168,6 +176,19 @@ select j.a.b + 1 from json2_table order by ts; | 11 | +------------------------------------------------------------+ +select abs(j.a.b) from json2_table order by ts; + +Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String No function matches the given name and argument types 'abs(Utf8View)'. You might need to add explicit type casts. + Candidate functions: + abs(Numeric(1)) + +-- "j.c" is of type "String"; "abs" should return all "null"s, but currently fails at planning. +select abs(j.c) from json2_table order by ts; + +Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String No function matches the given name and argument types 'abs(Utf8View)'. You might need to add explicit type casts. 
+ Candidate functions: + abs(Numeric(1)) + select j.d from json2_table order by ts; +-----------------------------------+ diff --git a/tests/cases/standalone/common/types/json/json2.sql b/tests/cases/standalone/common/types/json/json2.sql index 8dd6789bce..cb8df2f8b9 100644 --- a/tests/cases/standalone/common/types/json/json2.sql +++ b/tests/cases/standalone/common/types/json/json2.sql @@ -46,8 +46,17 @@ select j.a, j.a.x from json2_table order by ts; select j.c, j.y from json2_table order by ts; +select j from json2_table order by ts; + +select * from json2_table order by ts; + select j.a.b + 1 from json2_table order by ts; +select abs(j.a.b) from json2_table order by ts; + +-- "j.c" is of type "String"; "abs" should return all "null"s, but currently fails at planning. +select abs(j.c) from json2_table order by ts; + select j.d from json2_table order by ts; drop table json2_table;