Mirror of https://github.com/GreptimeTeam/greptimedb.git (synced 2025-12-22 22:20:02 +00:00)

Compare commits: v0.18.0-ni ... v0.17.2 (35 commits)
Commits in this range (SHA1 only; author and date were not captured in the mirror):
4bb9ceb63b, 38456638f8, 97c0b1f5c1, 4fc7f12360, ed17997449, 849ae8ebb6, a0587e2e87, 1ed71169ac,
e62f0e2b64, f92e753a34, a22b016f90, 7a9fa99069, d808e7be7e, 8e22fcfd5c, 26729c31a6, b73617eaba,
3b909f63e3, 0d4e07eddd, b94ce9019d, 3dcd40c4ba, a67803d0e9, aa7e7942f8, f1b7581dc3, cd761df369,
0cea6ae64d, 8bf772fb50, 9c1240921d, eb52129a91, a0a2b40cbe, 067c4458d6, 4e9c31bf5c, 9320a6ddaa,
4c9fcb7dee, 9dc16772fe, 6ee91f6af4
@@ -1,3 +1,8 @@
logging:
  level: "info"
  format: "json"
+  filters:
+    - log_store=debug
+meta:
+  configData: |-
+    [runtime]
.github/scripts/deploy-greptimedb.sh (vendored, 40 lines changed)
@@ -3,12 +3,14 @@
set -e
set -o pipefail

-KUBERNETES_VERSION="${KUBERNETES_VERSION:-v1.24.0}"
+KUBERNETES_VERSION="${KUBERNETES_VERSION:-v1.32.0}"
ENABLE_STANDALONE_MODE="${ENABLE_STANDALONE_MODE:-true}"
DEFAULT_INSTALL_NAMESPACE=${DEFAULT_INSTALL_NAMESPACE:-default}
GREPTIMEDB_IMAGE_TAG=${GREPTIMEDB_IMAGE_TAG:-latest}
-ETCD_CHART="oci://registry-1.docker.io/bitnamicharts/etcd"
GREPTIME_CHART="https://greptimeteam.github.io/helm-charts/"
+ETCD_CHART="oci://registry-1.docker.io/bitnamicharts/etcd"
+ETCD_CHART_VERSION="${ETCD_CHART_VERSION:-12.0.8}"
+ETCD_IMAGE_TAG="${ETCD_IMAGE_TAG:-3.6.1-debian-12-r3}"

# Create a cluster with 1 control-plane node and 5 workers.
function create_kind_cluster() {

@@ -35,10 +37,16 @@ function add_greptime_chart() {
function deploy_etcd_cluster() {
  local namespace="$1"

-  helm install etcd "$ETCD_CHART" \
+  helm upgrade --install etcd "$ETCD_CHART" \
+    --version "$ETCD_CHART_VERSION" \
    --create-namespace \
    --set replicaCount=3 \
    --set auth.rbac.create=false \
    --set auth.rbac.token.enabled=false \
+    --set global.security.allowInsecureImages=true \
+    --set image.registry=docker.io \
+    --set image.repository=greptime/etcd \
+    --set image.tag="$ETCD_IMAGE_TAG" \
    -n "$namespace"

  # Wait for etcd cluster to be ready.

@@ -48,7 +56,8 @@ function deploy_etcd_cluster() {
# Deploy greptimedb-operator.
function deploy_greptimedb_operator() {
  # Use the latest chart and image.
-  helm install greptimedb-operator greptime/greptimedb-operator \
+  helm upgrade --install greptimedb-operator greptime/greptimedb-operator \
+    --create-namespace \
    --set image.tag=latest \
    -n "$DEFAULT_INSTALL_NAMESPACE"

@@ -66,9 +75,11 @@ function deploy_greptimedb_cluster() {

  deploy_etcd_cluster "$install_namespace"

-  helm install "$cluster_name" greptime/greptimedb-cluster \
+  helm upgrade --install "$cluster_name" greptime/greptimedb-cluster \
+    --create-namespace \
    --set image.tag="$GREPTIMEDB_IMAGE_TAG" \
    --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
+    --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
    -n "$install_namespace"

  # Wait for greptimedb cluster to be ready.

@@ -101,15 +112,17 @@ function deploy_greptimedb_cluster_with_s3_storage() {

  deploy_etcd_cluster "$install_namespace"

-  helm install "$cluster_name" greptime/greptimedb-cluster -n "$install_namespace" \
+  helm upgrade --install "$cluster_name" greptime/greptimedb-cluster -n "$install_namespace" \
+    --create-namespace \
    --set image.tag="$GREPTIMEDB_IMAGE_TAG" \
    --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
-    --set storage.s3.bucket="$AWS_CI_TEST_BUCKET" \
-    --set storage.s3.region="$AWS_REGION" \
-    --set storage.s3.root="$DATA_ROOT" \
-    --set storage.credentials.secretName=s3-credentials \
-    --set storage.credentials.accessKeyId="$AWS_ACCESS_KEY_ID" \
-    --set storage.credentials.secretAccessKey="$AWS_SECRET_ACCESS_KEY"
+    --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
+    --set objectStorage.s3.bucket="$AWS_CI_TEST_BUCKET" \
+    --set objectStorage.s3.region="$AWS_REGION" \
+    --set objectStorage.s3.root="$DATA_ROOT" \
+    --set objectStorage.credentials.secretName=s3-credentials \
+    --set objectStorage.credentials.accessKeyId="$AWS_ACCESS_KEY_ID" \
+    --set objectStorage.credentials.secretAccessKey="$AWS_SECRET_ACCESS_KEY"

  # Wait for greptimedb cluster to be ready.
  while true; do

@@ -134,7 +147,8 @@ function deploy_greptimedb_cluster_with_s3_storage() {
# Deploy standalone greptimedb.
# It will expose cluster service ports as '34000', '34001', '34002', '34003' to local access.
function deploy_standalone_greptimedb() {
-  helm install greptimedb-standalone greptime/greptimedb-standalone \
+  helm upgrade --install greptimedb-standalone greptime/greptimedb-standalone \
+    --create-namespace \
    --set image.tag="$GREPTIMEDB_IMAGE_TAG" \
    -n "$DEFAULT_INSTALL_NAMESPACE"
.github/workflows/semantic-pull-request.yml (vendored, 6 lines changed)
@@ -1,7 +1,7 @@
name: "Semantic Pull Request"

on:
-  pull_request:
+  pull_request_target:
    types:
      - opened
      - reopened

@@ -12,9 +12,9 @@ concurrency:
  cancel-in-progress: true

permissions:
-  issues: write
-  contents: write
+  contents: read
  pull-requests: write
+  issues: write

jobs:
  check:
Cargo.lock (generated, 161 lines changed)
The Cargo.lock changes are mechanical and fall into four groups:

- Every workspace crate entry is bumped from version 0.17.0 to 0.17.2: api, auth, cache, catalog, cli, client, cmd, common-base, common-catalog, common-config, common-datasource, common-decimal, common-error, common-event-recorder, common-frontend, common-function, common-greptimedb-telemetry, common-grpc, common-grpc-expr, common-macro, common-mem-prof, common-meta, common-options, common-plugins, common-pprof, common-procedure, common-procedure-test, common-query, common-recordbatch, common-runtime, common-session, common-sql, common-telemetry, common-test-util, common-time, common-version, common-wal, common-workload, datanode, datatypes, file-engine, flow, frontend, index, log-query, log-store, meta-client, meta-srv, metric-engine, mito-codec, mito2, object-store, operator, partition, pipeline, plugins, promql, puffin, query, servers, session, sql, sqlness-runner, stat, store-api, substrait, table, tests-fuzz, and tests-integration.
- Dependency references on the workspace `substrait` crate change accordingly from `substrait 0.17.0` to `substrait 0.17.2` in cli, client, cmd, datanode, flow, frontend, operator, query, and tests-integration.
- `memcomparable` 0.2.0 switches from the crates.io registry (checksum `376101dbd964fc502d5902216e180f92b3d003b5cc3d2e40e044eb5470fca677`) to `git+https://github.com/v0y4g3r/memcomparable.git?rev=a07122dc03556bbd88ad66234cbea7efd3b23efb`, and `mito2` drops its direct `memcomparable` dependency.
- The workspace `substrait` crate's dependency list grows by one entry; the listed dependencies now include `common-function`.
@@ -73,7 +73,7 @@ members = [
resolver = "2"

[workspace.package]
-version = "0.17.0"
+version = "0.17.2"
edition = "2021"
license = "Apache-2.0"
@@ -402,8 +402,8 @@
| `event_recorder` | -- | -- | Configuration options for the event recorder. |
| `event_recorder.ttl` | String | `90d` | TTL for the events table that will be used to store the events. Default is `90d`. |
| `stats_persistence` | -- | -- | Configuration options for the stats persistence. |
-| `stats_persistence.ttl` | String | `30d` | TTL for the stats table that will be used to store the stats. Default is `30d`.<br/>Set to `0s` to disable stats persistence. |
-| `stats_persistence.interval` | String | `60s` | The interval to persist the stats. Default is `60s`.<br/>The minimum value is `60s`, if the value is less than `60s`, it will be overridden to `60s`. |
+| `stats_persistence.ttl` | String | `0s` | TTL for the stats table that will be used to store the stats.<br/>Set to `0s` to disable stats persistence.<br/>Default is `0s`.<br/>If you want to enable stats persistence, set the TTL to a value greater than 0.<br/>It is recommended to set a small value, e.g., `3h`. |
+| `stats_persistence.interval` | String | `10m` | The interval to persist the stats. Default is `10m`.<br/>The minimum value is `10m`, if the value is less than `10m`, it will be overridden to `10m`. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -274,12 +274,15 @@ ttl = "90d"

## Configuration options for the stats persistence.
[stats_persistence]
-## TTL for the stats table that will be used to store the stats. Default is `30d`.
+## TTL for the stats table that will be used to store the stats.
## Set to `0s` to disable stats persistence.
-ttl = "30d"
-## The interval to persist the stats. Default is `60s`.
-## The minimum value is `60s`, if the value is less than `60s`, it will be overridden to `60s`.
-interval = "60s"
+## Default is `0s`.
+## If you want to enable stats persistence, set the TTL to a value greater than 0.
+## It is recommended to set a small value, e.g., `3h`.
+ttl = "0s"
+## The interval to persist the stats. Default is `10m`.
+## The minimum value is `10m`, if the value is less than `10m`, it will be overridden to `10m`.
+interval = "10m"

## The logging options.
[logging]
(File diff suppressed because it is too large.)
@@ -87,6 +87,13 @@
| Other Request P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation!~"read\|write\|list\|Writer::write\|Writer::close\|Reader::read"}[$__rate_interval])))` | `timeseries` | Other Request P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Opendal traffic | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_bytes_sum{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Total traffic as in bytes by instance and operation | `prometheus` | `decbytes` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| OpenDAL errors per Instance | `sum by(instance, pod, scheme, operation, error) (rate(opendal_operation_errors_total{instance=~"$datanode", error!="NotFound"}[$__rate_interval]))` | `timeseries` | OpenDAL error counts per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]-[{{error}}]` |
+# Remote WAL
+| Title | Query | Type | Description | Datasource | Unit | Legend Format |
+| --- | --- | --- | --- | --- | --- | --- |
+| Triggered region flush total | `meta_triggered_region_flush_total` | `timeseries` | Triggered region flush total | `prometheus` | `none` | `{{pod}}-{{topic_name}}` |
+| Triggered region checkpoint total | `meta_triggered_region_checkpoint_total` | `timeseries` | Triggered region checkpoint total | `prometheus` | `none` | `{{pod}}-{{topic_name}}` |
+| Topic estimated replay size | `meta_topic_estimated_replay_size` | `timeseries` | Topic estimated max replay size | `prometheus` | `bytes` | `{{pod}}-{{topic_name}}` |
+| Kafka logstore's bytes traffic | `rate(greptime_logstore_kafka_client_bytes_total[$__rate_interval])` | `timeseries` | Kafka logstore's bytes traffic | `prometheus` | `bytes` | `{{pod}}-{{logstore}}` |
# Metasrv
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |

@@ -103,6 +110,8 @@
| Meta KV Ops Latency | `histogram_quantile(0.99, sum by(pod, le, op, target) (greptime_meta_kv_request_elapsed_bucket))` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `s` | `{{pod}}-{{op}} p99` |
| Rate of meta KV Ops | `rate(greptime_meta_kv_request_elapsed_count[$__rate_interval])` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `none` | `{{pod}}-{{op}} p99` |
| DDL Latency | `histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_tables_bucket))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_table))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_view))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_flow))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_drop_table))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_alter_table))` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `s` | `CreateLogicalTables-{{step}} p90` |
+| Reconciliation stats | `greptime_meta_reconciliation_stats` | `timeseries` | Reconciliation stats | `prometheus` | `s` | `{{pod}}-{{table_type}}-{{type}}` |
+| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` |
# Flownode
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
@@ -802,6 +802,48 @@ groups:
          type: prometheus
          uid: ${metrics}
        legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]-[{{error}}]'
+  - title: Remote WAL
+    panels:
+      - title: Triggered region flush total
+        type: timeseries
+        description: Triggered region flush total
+        unit: none
+        queries:
+          - expr: meta_triggered_region_flush_total
+            datasource:
+              type: prometheus
+              uid: ${metrics}
+            legendFormat: '{{pod}}-{{topic_name}}'
+      - title: Triggered region checkpoint total
+        type: timeseries
+        description: Triggered region checkpoint total
+        unit: none
+        queries:
+          - expr: meta_triggered_region_checkpoint_total
+            datasource:
+              type: prometheus
+              uid: ${metrics}
+            legendFormat: '{{pod}}-{{topic_name}}'
+      - title: Topic estimated replay size
+        type: timeseries
+        description: Topic estimated max replay size
+        unit: bytes
+        queries:
+          - expr: meta_topic_estimated_replay_size
+            datasource:
+              type: prometheus
+              uid: ${metrics}
+            legendFormat: '{{pod}}-{{topic_name}}'
+      - title: Kafka logstore's bytes traffic
+        type: timeseries
+        description: Kafka logstore's bytes traffic
+        unit: bytes
+        queries:
+          - expr: rate(greptime_logstore_kafka_client_bytes_total[$__rate_interval])
+            datasource:
+              type: prometheus
+              uid: ${metrics}
+            legendFormat: '{{pod}}-{{logstore}}'
  - title: Metasrv
    panels:
      - title: Region migration datanode

@@ -948,6 +990,26 @@ groups:
          type: prometheus
          uid: ${metrics}
        legendFormat: AlterTable-{{step}} p90
+      - title: Reconciliation stats
+        type: timeseries
+        description: Reconciliation stats
+        unit: s
+        queries:
+          - expr: greptime_meta_reconciliation_stats
+            datasource:
+              type: prometheus
+              uid: ${metrics}
+            legendFormat: '{{pod}}-{{table_type}}-{{type}}'
+      - title: Reconciliation steps
+        type: timeseries
+        description: 'Elapsed of Reconciliation steps '
+        unit: s
+        queries:
+          - expr: histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)
+            datasource:
+              type: prometheus
+              uid: ${metrics}
+            legendFormat: '{{procedure_name}}-{{step}}-P90'
  - title: Flownode
    panels:
      - title: Flow Ingest / Output Rate
(File diff suppressed because it is too large.)
@@ -87,6 +87,13 @@ and @@ -103,6 +110,8 @@
The standalone dashboard's markdown doc gains the same rows as the cluster doc above: a new "# Remote WAL" section with the Triggered region flush total, Triggered region checkpoint total, Topic estimated replay size, and Kafka logstore's bytes traffic rows, plus the Reconciliation stats and Reconciliation steps rows under "# Metasrv". The surrounding OpenDAL context rows are identical except that their queries omit the `instance=~"$datanode"` label selector.
@@ -802,6 +802,48 @@ groups: and @@ -948,6 +990,26 @@ groups:
The standalone dashboard YAML receives the same panel additions as the cluster dashboard YAML above: a "Remote WAL" group with four timeseries panels (Triggered region flush total, Triggered region checkpoint total, Topic estimated replay size, Kafka logstore's bytes traffic) and two extra Metasrv panels (Reconciliation stats, Reconciliation steps), each querying the prometheus datasource `${metrics}` with the expressions, units, and legend formats listed in the tables above.
@@ -26,12 +26,11 @@ use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::prelude::{ConcreteDataType, ScalarVectorBuilder, VectorRef};
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
-use datatypes::timestamp::TimestampMicrosecond;
+use datatypes::timestamp::TimestampSecond;
use datatypes::value::Value;
use datatypes::vectors::{
    ConstantVector, Int64Vector, Int64VectorBuilder, MutableVector, StringVector,
-    StringVectorBuilder, TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder,
-    UInt64VectorBuilder,
+    StringVectorBuilder, TimestampSecondVector, TimestampSecondVectorBuilder, UInt64VectorBuilder,
};
use futures::{StreamExt, TryStreamExt};
use partition::manager::PartitionInfo;

@@ -129,17 +128,17 @@ impl InformationSchemaPartitions {
            ColumnSchema::new("data_free", ConcreteDataType::int64_datatype(), true),
            ColumnSchema::new(
                "create_time",
-                ConcreteDataType::timestamp_microsecond_datatype(),
+                ConcreteDataType::timestamp_second_datatype(),
                true,
            ),
            ColumnSchema::new(
                "update_time",
-                ConcreteDataType::timestamp_microsecond_datatype(),
+                ConcreteDataType::timestamp_second_datatype(),
                true,
            ),
            ColumnSchema::new(
                "check_time",
-                ConcreteDataType::timestamp_microsecond_datatype(),
+                ConcreteDataType::timestamp_second_datatype(),
                true,
            ),
            ColumnSchema::new("checksum", ConcreteDataType::int64_datatype(), true),

@@ -212,7 +211,7 @@ struct InformationSchemaPartitionsBuilder {
    partition_names: StringVectorBuilder,
    partition_ordinal_positions: Int64VectorBuilder,
    partition_expressions: StringVectorBuilder,
-    create_times: TimestampMicrosecondVectorBuilder,
+    create_times: TimestampSecondVectorBuilder,
    partition_ids: UInt64VectorBuilder,
}

@@ -232,7 +231,7 @@ impl InformationSchemaPartitionsBuilder {
            partition_names: StringVectorBuilder::with_capacity(INIT_CAPACITY),
            partition_ordinal_positions: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
            partition_expressions: StringVectorBuilder::with_capacity(INIT_CAPACITY),
-            create_times: TimestampMicrosecondVectorBuilder::with_capacity(INIT_CAPACITY),
+            create_times: TimestampSecondVectorBuilder::with_capacity(INIT_CAPACITY),
            partition_ids: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
        }
    }

@@ -331,8 +330,8 @@ impl InformationSchemaPartitionsBuilder {
            .push(Some((index + 1) as i64));
        let expression = partition.partition_expr.as_ref().map(|e| e.to_string());
        self.partition_expressions.push(expression.as_deref());
-        self.create_times.push(Some(TimestampMicrosecond::from(
-            table_info.meta.created_on.timestamp_millis(),
+        self.create_times.push(Some(TimestampSecond::from(
+            table_info.meta.created_on.timestamp(),
        )));
        self.partition_ids.push(Some(partition.id.as_u64()));
    }

@@ -349,8 +348,8 @@ impl InformationSchemaPartitionsBuilder {
            Arc::new(Int64Vector::from(vec![None])),
            rows_num,
        ));
-        let null_timestampmicrosecond_vector = Arc::new(ConstantVector::new(
-            Arc::new(TimestampMicrosecondVector::from(vec![None])),
+        let null_timestamp_second_vector = Arc::new(ConstantVector::new(
+            Arc::new(TimestampSecondVector::from(vec![None])),
            rows_num,
        ));
        let partition_methods = Arc::new(ConstantVector::new(

@@ -380,8 +379,8 @@ impl InformationSchemaPartitionsBuilder {
            null_i64_vector.clone(),
            Arc::new(self.create_times.finish()),
            // TODO(dennis): supports update_time
-            null_timestampmicrosecond_vector.clone(),
-            null_timestampmicrosecond_vector,
+            null_timestamp_second_vector.clone(),
+            null_timestamp_second_vector,
            null_i64_vector,
            null_string_vector.clone(),
            null_string_vector.clone(),
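This diff switches the `information_schema.partitions` time columns from microsecond to second precision; notably, the old code pushed a `timestamp_millis()` value into a microsecond-typed builder, which reads like a unit mismatch, while the new code pushes whole seconds into a second-typed builder. A minimal sketch (not GreptimeDB code, assuming only the `chrono` crate) of why the magnitudes matter:

```rust
// Illustrates the factor-of-1000 skew when a millisecond count is stored in a
// column that is interpreted as microseconds, which the change above avoids
// by using whole seconds end to end.
use chrono::{TimeZone, Utc};

fn main() {
    let created_on = Utc.with_ymd_and_hms(2024, 1, 1, 0, 0, 0).unwrap();

    let secs = created_on.timestamp();          // 1_704_067_200
    let millis = created_on.timestamp_millis(); // 1_704_067_200_000

    println!("seconds since epoch: {secs}");
    println!("milliseconds since epoch: {millis}");
    // Reading the millisecond count as if it were microseconds lands near 1970,
    // three orders of magnitude away from the intended instant.
    println!("same number read as microseconds ≈ {} s", millis / 1_000_000);
}
```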
@@ -83,6 +83,20 @@ pub(crate) struct StoreConfig {
}

impl StoreConfig {
+    pub fn tls_config(&self) -> Option<TlsOption> {
+        if self.backend_tls_mode != TlsMode::Disable {
+            Some(TlsOption {
+                mode: self.backend_tls_mode.clone(),
+                cert_path: self.backend_tls_cert_path.clone(),
+                key_path: self.backend_tls_key_path.clone(),
+                ca_cert_path: self.backend_tls_ca_cert_path.clone(),
+                watch: self.backend_tls_watch,
+            })
+        } else {
+            None
+        }
+    }
+
    /// Builds a [`KvBackendRef`] from the store configuration.
    pub async fn build(&self) -> Result<KvBackendRef, BoxedError> {
        let max_txn_ops = self.max_txn_ops;

@@ -92,17 +106,7 @@ impl StoreConfig {
        } else {
            let kvbackend = match self.backend {
                BackendImpl::EtcdStore => {
-                    let tls_config = if self.backend_tls_mode != TlsMode::Disable {
-                        Some(TlsOption {
-                            mode: self.backend_tls_mode.clone(),
-                            cert_path: self.backend_tls_cert_path.clone(),
-                            key_path: self.backend_tls_key_path.clone(),
-                            ca_cert_path: self.backend_tls_ca_cert_path.clone(),
-                            watch: self.backend_tls_watch,
-                        })
-                    } else {
-                        None
-                    };
+                    let tls_config = self.tls_config();
                    let etcd_client = create_etcd_client_with_tls(store_addrs, tls_config.as_ref())
                        .await
                        .map_err(BoxedError::new)?;

@@ -111,7 +115,8 @@ impl StoreConfig {
            #[cfg(feature = "pg_kvbackend")]
            BackendImpl::PostgresStore => {
                let table_name = &self.meta_table_name;
-                let pool = meta_srv::bootstrap::create_postgres_pool(store_addrs, None)
+                let tls_config = self.tls_config();
+                let pool = meta_srv::bootstrap::create_postgres_pool(store_addrs, tls_config)
                    .await
                    .map_err(BoxedError::new)?;
                let schema_name = self.meta_schema_name.as_deref();
@@ -196,7 +196,10 @@ pub async fn stream_to_parquet(
    concurrency: usize,
) -> Result<usize> {
    let write_props = column_wise_config(
-        WriterProperties::builder().set_compression(Compression::ZSTD(ZstdLevel::default())),
+        WriterProperties::builder()
+            .set_compression(Compression::ZSTD(ZstdLevel::default()))
+            .set_statistics_truncate_length(None)
+            .set_column_index_truncate_length(None),
        schema,
    )
    .build();
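The change above keeps full (untruncated) statistics and column-index min/max values while still compressing with zstd. A small, hedged sketch of the same builder calls using the standard `parquet` crate directly (import paths are my assumption; the project's `column_wise_config` helper from the diff is omitted):

```rust
// Standalone sketch of the writer properties configured above; not the exact
// GreptimeDB helper, just the builder calls shown in the diff.
use parquet::basic::{Compression, ZstdLevel};
use parquet::file::properties::WriterProperties;

fn writer_props() -> WriterProperties {
    WriterProperties::builder()
        // Compress data pages with zstd at its default level.
        .set_compression(Compression::ZSTD(ZstdLevel::default()))
        // `None` disables truncation, keeping full statistics values...
        .set_statistics_truncate_length(None)
        // ...and full min/max values in the column index.
        .set_column_index_truncate_length(None)
        .build()
}

fn main() {
    let _props = writer_props();
    println!("writer properties built with zstd compression and no truncation");
}
```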
@@ -251,7 +251,6 @@ macro_rules! define_from_tonic_status {
                .get(key)
                .and_then(|v| String::from_utf8(v.as_bytes().to_vec()).ok())
        }
-
        let code = metadata_value(&e, $crate::GREPTIME_DB_HEADER_ERROR_CODE)
            .and_then(|s| {
                if let Ok(code) = s.parse::<u32>() {
@@ -41,7 +41,12 @@ use datafusion_expr::{
|
||||
use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
|
||||
use datatypes::arrow::datatypes::{DataType, Field};
|
||||
|
||||
use crate::function_registry::FunctionRegistry;
|
||||
use crate::aggrs::aggr_wrapper::fix_order::FixStateUdafOrderingAnalyzer;
|
||||
use crate::function_registry::{FunctionRegistry, FUNCTION_REGISTRY};
|
||||
|
||||
pub mod fix_order;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
/// Returns the name of the state function for the given aggregate function name.
|
||||
/// The state function is used to compute the state of the aggregate function.
|
||||
@@ -57,6 +62,39 @@ pub fn aggr_merge_func_name(aggr_name: &str) -> String {
|
||||
format!("__{}_merge", aggr_name)
|
||||
}
|
||||
|
||||
/// Check if the given aggregate expression is steppable.
|
||||
/// As in if it can be split into multiple steps:
|
||||
/// i.e. on datanode first call `state(input)` then
|
||||
/// on frontend call `calc(merge(state))` to get the final result.
|
||||
pub fn is_all_aggr_exprs_steppable(aggr_exprs: &[Expr]) -> bool {
|
||||
aggr_exprs.iter().all(|expr| {
|
||||
if let Some(aggr_func) = get_aggr_func(expr) {
|
||||
if aggr_func.params.distinct {
|
||||
// Distinct aggregate functions are not steppable(yet).
|
||||
// TODO(discord9): support distinct aggregate functions.
|
||||
return false;
|
||||
}
|
||||
|
||||
// whether the corresponding state function exists in the registry
|
||||
FUNCTION_REGISTRY.is_aggr_func_exist(&aggr_state_func_name(aggr_func.func.name()))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_aggr_func(expr: &Expr) -> Option<&datafusion_expr::expr::AggregateFunction> {
|
||||
let mut expr_ref = expr;
|
||||
while let Expr::Alias(alias) = expr_ref {
|
||||
expr_ref = &alias.expr;
|
||||
}
|
||||
if let Expr::AggregateFunction(aggr_func) = expr_ref {
|
||||
Some(aggr_func)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper to make an aggregate function out of the state and merge functions of the original aggregate function.
|
||||
/// It contains the original aggregate function, the state functions, and the merge function.
|
||||
///
|
||||
@@ -74,18 +112,6 @@ pub struct StepAggrPlan {
|
||||
pub lower_state: LogicalPlan,
|
||||
}
|
||||
|
||||
pub fn get_aggr_func(expr: &Expr) -> Option<&datafusion_expr::expr::AggregateFunction> {
|
||||
let mut expr_ref = expr;
|
||||
while let Expr::Alias(alias) = expr_ref {
|
||||
expr_ref = &alias.expr;
|
||||
}
|
||||
if let Expr::AggregateFunction(aggr_func) = expr_ref {
|
||||
Some(aggr_func)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl StateMergeHelper {
|
||||
/// Register all the `state` function of supported aggregate functions.
|
||||
/// Note that can't register `merge` function here, as it needs to be created from the original aggregate function with given input types.
|
||||
@@ -118,6 +144,7 @@ impl StateMergeHelper {
|
||||
}
|
||||
|
||||
/// Split an aggregate plan into two aggregate plans, one for the state function and one for the merge function.
|
||||
///
|
||||
pub fn split_aggr_node(aggr_plan: Aggregate) -> datafusion_common::Result<StepAggrPlan> {
|
||||
let aggr = {
|
||||
// certain aggr func need type coercion to work correctly, so we need to analyze the plan first.
|
||||
@@ -137,6 +164,15 @@ impl StateMergeHelper {
|
||||
let mut lower_aggr_exprs = vec![];
|
||||
let mut upper_aggr_exprs = vec![];
|
||||
|
||||
// group exprs for upper plan should refer to the output group expr as column from lower plan
|
||||
// to avoid re-compute group exprs again.
|
||||
let upper_group_exprs = aggr
|
||||
.group_expr
|
||||
.iter()
|
||||
.map(|c| c.qualified_name())
|
||||
.map(|(r, c)| Expr::Column(Column::new(r, c)))
|
||||
.collect();
|
||||
|
||||
for aggr_expr in aggr.aggr_expr.iter() {
|
||||
let Some(aggr_func) = get_aggr_func(aggr_expr) else {
|
||||
return Err(datafusion_common::DataFusionError::NotImplemented(format!(
|
||||
@@ -164,6 +200,7 @@ impl StateMergeHelper {
|
||||
|
||||
lower_aggr_exprs.push(expr);
|
||||
|
||||
// then create the merge function using the physical expression of the original aggregate function
|
||||
let (original_phy_expr, _filter, _ordering) = create_aggregate_expr_and_maybe_filter(
|
||||
aggr_expr,
|
||||
aggr.input.schema(),
|
||||
@@ -179,9 +216,15 @@ impl StateMergeHelper {
|
||||
let arg = Expr::Column(Column::new_unqualified(lower_state_output_col_name));
|
||||
let expr = AggregateFunction {
|
||||
func: Arc::new(merge_func.into()),
|
||||
// notice filter/order_by is not supported in the merge function, as it's not meaningful to have them in the merge phase.
|
||||
// do notice this order by is only removed in the outer logical plan, the physical plan still have order by and hence
|
||||
// can create correct accumulator with order by.
|
||||
params: AggregateFunctionParams {
|
||||
args: vec![arg],
|
||||
..aggr_func.params.clone()
|
||||
distinct: aggr_func.params.distinct,
|
||||
filter: None,
|
||||
order_by: vec![],
|
||||
null_treatment: aggr_func.params.null_treatment,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -198,10 +241,18 @@ impl StateMergeHelper {
|
||||
// update aggregate's output schema
|
||||
let lower_plan = lower_plan.recompute_schema()?;
|
||||
|
||||
let mut upper = aggr.clone();
|
||||
// should only affect two udaf `first_value/last_value`
|
||||
// which only them have meaningful order by field
|
||||
let fixed_lower_plan =
|
||||
FixStateUdafOrderingAnalyzer.analyze(lower_plan, &Default::default())?;
|
||||
|
||||
let upper = Aggregate::try_new(
|
||||
Arc::new(fixed_lower_plan.clone()),
|
||||
upper_group_exprs,
|
||||
upper_aggr_exprs.clone(),
|
||||
)?;
|
||||
let aggr_plan = LogicalPlan::Aggregate(aggr);
|
||||
upper.aggr_expr = upper_aggr_exprs;
|
||||
upper.input = Arc::new(lower_plan.clone());
|
||||
|
||||
// upper schema's output schema should be the same as the original aggregate plan's output schema
|
||||
let upper_check = upper;
|
||||
let upper_plan = LogicalPlan::Aggregate(upper_check).recompute_schema()?;
|
||||
@@ -213,7 +264,7 @@ impl StateMergeHelper {
|
||||
}
|
||||
|
||||
Ok(StepAggrPlan {
|
||||
lower_state: lower_plan,
|
||||
lower_state: fixed_lower_plan,
|
||||
upper_merge: upper_plan,
|
||||
})
|
||||
}
|
||||
@@ -224,13 +275,22 @@ impl StateMergeHelper {
|
||||
pub struct StateWrapper {
|
||||
inner: AggregateUDF,
|
||||
name: String,
|
||||
/// Default to empty, might get fixed by analyzer later
|
||||
ordering: Vec<FieldRef>,
|
||||
/// Default to false, might get fixed by analyzer later
|
||||
distinct: bool,
|
||||
}
|
||||
|
||||
impl StateWrapper {
|
||||
/// `state_index`: The index of the state in the output of the state function.
|
||||
pub fn new(inner: AggregateUDF) -> datafusion_common::Result<Self> {
|
||||
let name = aggr_state_func_name(inner.name());
|
||||
Ok(Self { inner, name })
|
||||
Ok(Self {
|
||||
inner,
|
||||
name,
|
||||
ordering: vec![],
|
||||
distinct: false,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn inner(&self) -> &AggregateUDF {
|
||||
@@ -244,7 +304,19 @@ impl StateWrapper {
|
||||
&self,
|
||||
acc_args: &datafusion_expr::function::AccumulatorArgs,
|
||||
) -> datafusion_common::Result<FieldRef> {
|
||||
self.inner.return_field(acc_args.schema.fields())
|
||||
let input_fields = acc_args
|
||||
.exprs
|
||||
.iter()
|
||||
.map(|e| e.return_field(acc_args.schema))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
self.inner.return_field(&input_fields).inspect_err(|e| {
|
||||
common_telemetry::error!(
|
||||
"StateWrapper: {:#?}\nacc_args:{:?}\nerror:{:?}",
|
||||
&self,
|
||||
&acc_args,
|
||||
e
|
||||
);
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,6 +340,7 @@ impl AggregateUDFImpl for StateWrapper {
|
||||
};
|
||||
self.inner.accumulator(acc_args)?
|
||||
};
|
||||
|
||||
Ok(Box::new(StateAccum::new(inner, state_type)?))
|
||||
}
|
||||
|
||||
@@ -294,11 +367,22 @@ impl AggregateUDFImpl for StateWrapper {
|
||||
name: self.inner().name(),
|
||||
input_fields,
|
||||
return_field: self.inner.return_field(input_fields)?,
|
||||
// TODO(discord9): how to get this?, probably ok?
|
||||
ordering_fields: &[],
|
||||
is_distinct: false,
|
||||
// those args are also needed as they are vital to construct the state fields correctly.
|
||||
ordering_fields: &self.ordering,
|
||||
is_distinct: self.distinct,
|
||||
};
|
||||
let state_fields = self.inner.state_fields(state_fields_args)?;
|
||||
|
||||
let state_fields = state_fields
|
||||
.into_iter()
|
||||
.map(|f| {
|
||||
let mut f = f.as_ref().clone();
|
||||
// since state can be null when no input rows, so make all fields nullable
|
||||
f.set_nullable(true);
|
||||
Arc::new(f)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let struct_field = DataType::Struct(state_fields.into());
|
||||
Ok(struct_field)
|
||||
}
|
||||
@@ -363,6 +447,39 @@ impl Accumulator for StateAccum {
|
||||
.iter()
|
||||
.map(|s| s.to_array())
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
let array_type = array
|
||||
.iter()
|
||||
.map(|a| a.data_type().clone())
|
||||
.collect::<Vec<_>>();
|
||||
let expected_type: Vec<_> = self
|
||||
.state_fields
|
||||
.iter()
|
||||
.map(|f| f.data_type().clone())
|
||||
.collect();
|
||||
if array_type != expected_type {
|
||||
debug!(
|
||||
"State mismatch, expected: {}, got: {} for expected fields: {:?} and given array types: {:?}",
|
||||
self.state_fields.len(),
|
||||
array.len(),
|
||||
self.state_fields,
|
||||
array_type,
|
||||
);
|
||||
let guess_schema = array
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(index, array)| {
|
||||
Field::new(
|
||||
format!("col_{index}[mismatch_state]").as_str(),
|
||||
array.data_type().clone(),
|
||||
true,
|
||||
)
|
||||
})
|
||||
.collect::<Fields>();
|
||||
let arr = StructArray::try_new(guess_schema, array, None)?;
|
||||
|
||||
return Ok(ScalarValue::Struct(Arc::new(arr)));
|
||||
}
|
||||
|
||||
let struct_array = StructArray::try_new(self.state_fields.clone(), array, None)?;
|
||||
Ok(ScalarValue::Struct(Arc::new(struct_array)))
|
||||
}
|
||||
@@ -401,7 +518,7 @@ pub struct MergeWrapper {
|
||||
merge_signature: Signature,
|
||||
/// The original physical expression of the aggregate function; we can't store the original aggregate function directly, as PhysicalExpr doesn't implement Any
|
||||
original_phy_expr: Arc<AggregateFunctionExpr>,
|
||||
original_input_types: Vec<DataType>,
|
||||
return_type: DataType,
|
||||
}
|
||||
impl MergeWrapper {
|
||||
pub fn new(
|
||||
@@ -412,13 +529,14 @@ impl MergeWrapper {
|
||||
let name = aggr_merge_func_name(inner.name());
|
||||
// the input type is actually a struct type made up of the state fields of the original aggregate function.
|
||||
let merge_signature = Signature::user_defined(datafusion_expr::Volatility::Immutable);
|
||||
let return_type = inner.return_type(&original_input_types)?;
|
||||
|
||||
Ok(Self {
|
||||
inner,
|
||||
name,
|
||||
merge_signature,
|
||||
original_phy_expr,
|
||||
original_input_types,
|
||||
return_type,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -470,8 +588,7 @@ impl AggregateUDFImpl for MergeWrapper {
|
||||
/// so return fixed return type instead of using `arg_types` to determine the return type.
|
||||
fn return_type(&self, _arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
|
||||
// The return type is the same as the original aggregate function's return type.
|
||||
let ret_type = self.inner.return_type(&self.original_input_types)?;
|
||||
Ok(ret_type)
|
||||
Ok(self.return_type.clone())
|
||||
}
|
||||
fn signature(&self) -> &Signature {
|
||||
&self.merge_signature
|
||||
@@ -541,10 +658,11 @@ impl Accumulator for MergeAccum {
|
||||
})?;
|
||||
let fields = struct_arr.fields();
|
||||
if fields != &self.state_fields {
|
||||
return Err(datafusion_common::DataFusionError::Internal(format!(
|
||||
"Expected state fields: {:?}, got: {:?}",
|
||||
debug!(
|
||||
"State fields mismatch, expected: {:?}, got: {:?}",
|
||||
self.state_fields, fields
|
||||
)));
|
||||
);
|
||||
// state fields mismatch might be acceptable by datafusion, continue
|
||||
}
|
||||
|
||||
// now fields should be the same, so we can merge the batch
|
||||
@@ -561,6 +679,3 @@ impl Accumulator for MergeAccum {
|
||||
self.inner.state()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
189
src/common/function/src/aggrs/aggr_wrapper/fix_order.rs
Normal file
@@ -0,0 +1,189 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::debug;
|
||||
use datafusion::config::ConfigOptions;
|
||||
use datafusion::optimizer::AnalyzerRule;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
|
||||
use datafusion_expr::{AggregateUDF, Expr, ExprSchemable, LogicalPlan};
|
||||
|
||||
use crate::aggrs::aggr_wrapper::StateWrapper;
|
||||
|
||||
/// Traverse the plan, find all `__<aggr_name>_state` calls and fix their ordering fields
/// when the wrapped aggregate has an `ORDER BY`; currently this only matters for the `first_value` and `last_value` UDAFs.
///
/// Should be applied to the datanode's query engine.
/// TODO(discord9): find a proper way to extend substrait's serde ability so custom UDAFs can carry more info
|
||||
#[derive(Debug, Default)]
|
||||
pub struct FixStateUdafOrderingAnalyzer;
|
||||
|
||||
impl AnalyzerRule for FixStateUdafOrderingAnalyzer {
|
||||
fn name(&self) -> &str {
|
||||
"FixStateUdafOrderingAnalyzer"
|
||||
}
|
||||
|
||||
fn analyze(
|
||||
&self,
|
||||
plan: LogicalPlan,
|
||||
_config: &ConfigOptions,
|
||||
) -> datafusion_common::Result<LogicalPlan> {
|
||||
plan.rewrite_with_subqueries(&mut FixOrderingRewriter::new(true))
|
||||
.map(|t| t.data)
|
||||
}
|
||||
}
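A hedged sketch of applying the rule on the decode path, mirroring how the substrait convertor later in this change invokes it (the helper name is hypothetical; `plan` is any decoded `LogicalPlan`):

use datafusion::config::ConfigOptions;
use datafusion::optimizer::AnalyzerRule;
use datafusion_expr::LogicalPlan;

// Fix up ordering/distinct info on any `__<aggr_name>_state` UDAF calls in the plan.
fn fix_decoded_plan(plan: LogicalPlan) -> datafusion_common::Result<LogicalPlan> {
    FixStateUdafOrderingAnalyzer {}.analyze(plan, &ConfigOptions::default())
}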
|
||||
|
||||
/// Traverse the plan, find all `__<aggr_name>_state` calls and remove their ordering fields.
/// This is currently only needed for the `first_value` and `last_value` UDAFs when encoding to substrait.
///
|
||||
#[derive(Debug, Default)]
|
||||
pub struct UnFixStateUdafOrderingAnalyzer;
|
||||
|
||||
impl AnalyzerRule for UnFixStateUdafOrderingAnalyzer {
|
||||
fn name(&self) -> &str {
|
||||
"UnFixStateUdafOrderingAnalyzer"
|
||||
}
|
||||
|
||||
fn analyze(
|
||||
&self,
|
||||
plan: LogicalPlan,
|
||||
_config: &ConfigOptions,
|
||||
) -> datafusion_common::Result<LogicalPlan> {
|
||||
plan.rewrite_with_subqueries(&mut FixOrderingRewriter::new(false))
|
||||
.map(|t| t.data)
|
||||
}
|
||||
}
|
||||
|
||||
struct FixOrderingRewriter {
|
||||
/// Once fixed, mark dirty and always recompute the schema bottom-up.
is_dirty: bool,
/// If true, add the ordering field from the outer aggregate expression;
/// if false, remove the ordering field.
is_fix: bool,
|
||||
}
|
||||
|
||||
impl FixOrderingRewriter {
|
||||
pub fn new(is_fix: bool) -> Self {
|
||||
Self {
|
||||
is_dirty: false,
|
||||
is_fix,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for FixOrderingRewriter {
|
||||
type Node = LogicalPlan;
|
||||
|
||||
/// Find all `__<aggr_name>_state` calls and fix their ordering fields
/// when the wrapped aggregate has an `ORDER BY`.
|
||||
fn f_up(
|
||||
&mut self,
|
||||
node: Self::Node,
|
||||
) -> datafusion_common::Result<datafusion_common::tree_node::Transformed<Self::Node>> {
|
||||
let LogicalPlan::Aggregate(mut aggregate) = node else {
|
||||
return if self.is_dirty {
|
||||
let node = node.recompute_schema()?;
|
||||
Ok(Transformed::yes(node))
|
||||
} else {
|
||||
Ok(Transformed::no(node))
|
||||
};
|
||||
};
|
||||
|
||||
// rewrite any state UDAF calls nested inside each aggregate expression
|
||||
for aggr_expr in &mut aggregate.aggr_expr {
|
||||
let new_aggr_expr = aggr_expr
|
||||
.clone()
|
||||
.transform_up(|expr| rewrite_expr(expr, &aggregate.input, self.is_fix))?;
|
||||
|
||||
if new_aggr_expr.transformed {
|
||||
*aggr_expr = new_aggr_expr.data;
|
||||
self.is_dirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
if self.is_dirty {
|
||||
let node = LogicalPlan::Aggregate(aggregate).recompute_schema()?;
|
||||
debug!(
|
||||
"FixStateUdafOrderingAnalyzer: plan schema's field changed to {:?}",
|
||||
node.schema().fields()
|
||||
);
|
||||
|
||||
Ok(Transformed::yes(node))
|
||||
} else {
|
||||
Ok(Transformed::no(LogicalPlan::Aggregate(aggregate)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// First locate the aggregate node inside the expression,
/// since it can be nested, e.g. `alias(aggr(sort))`.
/// If the contained aggregate expression has an `ORDER BY` and it is a state UDAF wrapper,
/// fix the ordering field of the state UDAF
/// to be the same as the aggregate expression's.
|
||||
fn rewrite_expr(
|
||||
expr: Expr,
|
||||
aggregate_input: &Arc<LogicalPlan>,
|
||||
is_fix: bool,
|
||||
) -> Result<Transformed<Expr>, datafusion_common::DataFusionError> {
|
||||
let Expr::AggregateFunction(aggregate_function) = expr else {
|
||||
return Ok(Transformed::no(expr));
|
||||
};
|
||||
|
||||
let Some(old_state_wrapper) = aggregate_function
|
||||
.func
|
||||
.inner()
|
||||
.as_any()
|
||||
.downcast_ref::<StateWrapper>()
|
||||
else {
|
||||
return Ok(Transformed::no(Expr::AggregateFunction(aggregate_function)));
|
||||
};
|
||||
|
||||
let mut state_wrapper = old_state_wrapper.clone();
|
||||
if is_fix {
|
||||
// always fix up the ordering fields and the distinct flag
|
||||
let order_by = aggregate_function.params.order_by.clone();
|
||||
let ordering_fields: Vec<_> = order_by
|
||||
.iter()
|
||||
.map(|sort_expr| {
|
||||
sort_expr
|
||||
.expr
|
||||
.to_field(&aggregate_input.schema())
|
||||
.map(|(_, f)| f)
|
||||
})
|
||||
.collect::<datafusion_common::Result<Vec<_>>>()?;
|
||||
let distinct = aggregate_function.params.distinct;
|
||||
|
||||
// fixing up
|
||||
state_wrapper.ordering = ordering_fields;
|
||||
state_wrapper.distinct = distinct;
|
||||
} else {
|
||||
// remove the ordering field & distinct flag
|
||||
state_wrapper.ordering = vec![];
|
||||
state_wrapper.distinct = false;
|
||||
}
|
||||
|
||||
debug!(
|
||||
"FixStateUdafOrderingAnalyzer: fix state udaf from {old_state_wrapper:?} to {:?}",
|
||||
state_wrapper
|
||||
);
|
||||
|
||||
let mut aggregate_function = aggregate_function;
|
||||
|
||||
aggregate_function.func = Arc::new(AggregateUDF::new_from_impl(state_wrapper));
|
||||
|
||||
Ok(Transformed::yes(Expr::AggregateFunction(
|
||||
aggregate_function,
|
||||
)))
|
||||
}
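And the encode-side counterpart, a sketch of stripping the fixed-up info again before serializing to substrait (mirroring the convertor change later in this diff; the helper name is hypothetical):

use datafusion::config::ConfigOptions;
use datafusion::optimizer::AnalyzerRule;
use datafusion_expr::LogicalPlan;

// Remove ordering/distinct info so the plan round-trips through substrait cleanly.
fn unfix_before_encode(plan: &LogicalPlan) -> datafusion_common::Result<LogicalPlan> {
    UnFixStateUdafOrderingAnalyzer {}.analyze(plan.clone(), &ConfigOptions::default())
}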
|
||||
@@ -17,13 +17,15 @@ use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
|
||||
use arrow::array::{ArrayRef, Float64Array, Int64Array, UInt64Array};
|
||||
use arrow::array::{ArrayRef, BooleanArray, Float64Array, Int64Array, UInt64Array};
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use arrow_schema::SchemaRef;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use datafusion::catalog::{Session, TableProvider};
|
||||
use datafusion::datasource::DefaultTableSource;
|
||||
use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext};
|
||||
use datafusion::functions_aggregate::average::avg_udaf;
|
||||
use datafusion::functions_aggregate::count::count_udaf;
|
||||
use datafusion::functions_aggregate::sum::sum_udaf;
|
||||
use datafusion::optimizer::analyzer::type_coercion::TypeCoercion;
|
||||
use datafusion::optimizer::AnalyzerRule;
|
||||
@@ -537,6 +539,208 @@ async fn test_avg_udaf() {
|
||||
assert_eq!(merge_eval_res, ScalarValue::Float64(Some(132. / 45_f64)));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_last_value_order_by_udaf() {
|
||||
init_default_ut_logging();
|
||||
let ctx = SessionContext::new();
|
||||
|
||||
let last_value = datafusion::functions_aggregate::first_last::last_value_udaf();
|
||||
let last_value = (*last_value).clone();
|
||||
|
||||
let original_aggr = Aggregate::try_new(
|
||||
Arc::new(dummy_table_scan()),
|
||||
vec![],
|
||||
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
|
||||
Arc::new(last_value.clone()),
|
||||
vec![Expr::Column(Column::new_unqualified("number"))],
|
||||
false,
|
||||
None,
|
||||
vec![datafusion_expr::expr::Sort::new(
|
||||
Expr::Column(Column::new_unqualified("number")),
|
||||
true,
|
||||
true,
|
||||
)],
|
||||
None,
|
||||
))],
|
||||
)
|
||||
.unwrap();
|
||||
let res = StateMergeHelper::split_aggr_node(original_aggr).unwrap();
|
||||
|
||||
let state_func: Arc<AggregateUDF> =
|
||||
Arc::new(StateWrapper::new(last_value.clone()).unwrap().into());
|
||||
|
||||
let expected_aggr_state_plan = LogicalPlan::Aggregate(
|
||||
Aggregate::try_new(
|
||||
Arc::new(dummy_table_scan()),
|
||||
vec![],
|
||||
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
|
||||
state_func,
|
||||
vec![Expr::Column(Column::new_unqualified("number"))],
|
||||
false,
|
||||
None,
|
||||
vec![datafusion_expr::expr::Sort::new(
|
||||
Expr::Column(Column::new_unqualified("number")),
|
||||
true,
|
||||
true,
|
||||
)],
|
||||
None,
|
||||
))],
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
// fix the ordering & distinct info of the state udaf, as they are not set in the wrapper.
|
||||
let fixed_aggr_state_plan = FixStateUdafOrderingAnalyzer {}
|
||||
.analyze(expected_aggr_state_plan.clone(), &Default::default())
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(&res.lower_state, &fixed_aggr_state_plan);
|
||||
|
||||
// schema is the state fields of the last_value udaf
|
||||
assert_eq!(
|
||||
res.lower_state.schema().as_arrow(),
|
||||
&arrow_schema::Schema::new(vec![Field::new(
|
||||
"__last_value_state(number) ORDER BY [number ASC NULLS FIRST]",
|
||||
DataType::Struct(
|
||||
vec![
|
||||
Field::new("last_value[last_value]", DataType::Int64, true),
|
||||
Field::new("number", DataType::Int64, true), // ordering field is added to state fields too
|
||||
Field::new("is_set", DataType::Boolean, true)
|
||||
]
|
||||
.into()
|
||||
),
|
||||
true,
|
||||
)])
|
||||
);
|
||||
|
||||
let expected_merge_fn = MergeWrapper::new(
|
||||
last_value.clone(),
|
||||
Arc::new(
|
||||
AggregateExprBuilder::new(
|
||||
Arc::new(last_value.clone()),
|
||||
vec![Arc::new(
|
||||
datafusion::physical_expr::expressions::Column::new("number", 0),
|
||||
)],
|
||||
)
|
||||
.schema(Arc::new(dummy_table_scan().schema().as_arrow().clone()))
|
||||
.alias("last_value(number) ORDER BY [number ASC NULLS FIRST]")
|
||||
.build()
|
||||
.unwrap(),
|
||||
),
|
||||
vec![DataType::Int64],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let expected_merge_plan = LogicalPlan::Aggregate(
|
||||
Aggregate::try_new(
|
||||
Arc::new(fixed_aggr_state_plan.clone()),
|
||||
vec![],
|
||||
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
|
||||
Arc::new(expected_merge_fn.into()),
|
||||
vec![Expr::Column(Column::new_unqualified(
|
||||
"__last_value_state(number) ORDER BY [number ASC NULLS FIRST]",
|
||||
))],
|
||||
false,
|
||||
None,
|
||||
vec![],
|
||||
None,
|
||||
))
|
||||
.alias("last_value(number) ORDER BY [number ASC NULLS FIRST]")],
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
assert_eq!(&res.upper_merge, &expected_merge_plan);
|
||||
|
||||
let phy_aggr_state_plan = DefaultPhysicalPlanner::default()
|
||||
.create_physical_plan(&fixed_aggr_state_plan, &ctx.state())
|
||||
.await
|
||||
.unwrap();
|
||||
let aggr_exec = phy_aggr_state_plan
|
||||
.as_any()
|
||||
.downcast_ref::<AggregateExec>()
|
||||
.unwrap();
|
||||
let aggr_func_expr = &aggr_exec.aggr_expr()[0];
|
||||
|
||||
let mut state_accum = aggr_func_expr.create_accumulator().unwrap();
|
||||
|
||||
// evaluate the state function
|
||||
let input = Int64Array::from(vec![Some(1), Some(2), None, Some(3)]);
|
||||
let values = vec![Arc::new(input) as arrow::array::ArrayRef];
|
||||
|
||||
state_accum.update_batch(&values).unwrap();
|
||||
|
||||
let state = state_accum.state().unwrap();
|
||||
|
||||
// FIXME(discord9): once datafusion fixes the issue that last_value's state fields are not reported correctly (the ordering field is missing when the `last` field is part of the ordering),
// change this back to 3 fields.
assert_eq!(state.len(), 2); // last_value optimization (or maybe a bug?) means it only has 2 state fields for now
|
||||
assert_eq!(state[0], ScalarValue::Int64(Some(3)));
|
||||
assert_eq!(state[1], ScalarValue::Boolean(Some(true)));
|
||||
|
||||
let eval_res = state_accum.evaluate().unwrap();
|
||||
let expected = Arc::new(
|
||||
StructArray::try_new(
|
||||
vec![
|
||||
Field::new("col_0[mismatch_state]", DataType::Int64, true),
|
||||
Field::new("col_1[mismatch_state]", DataType::Boolean, true),
|
||||
// Field::new("last_value[last_value]", DataType::Int64, true),
|
||||
// Field::new("number", DataType::Int64, true),
|
||||
// Field::new("is_set", DataType::Boolean, true),
|
||||
]
|
||||
.into(),
|
||||
vec![
|
||||
Arc::new(Int64Array::from(vec![Some(3)])),
|
||||
// Arc::new(Int64Array::from(vec![Some(3)])),
|
||||
Arc::new(BooleanArray::from(vec![Some(true)])),
|
||||
],
|
||||
None,
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
assert_eq!(eval_res, ScalarValue::Struct(expected));
|
||||
|
||||
let phy_aggr_merge_plan = DefaultPhysicalPlanner::default()
|
||||
.create_physical_plan(&res.upper_merge, &ctx.state())
|
||||
.await
|
||||
.unwrap();
|
||||
let aggr_exec = phy_aggr_merge_plan
|
||||
.as_any()
|
||||
.downcast_ref::<AggregateExec>()
|
||||
.unwrap();
|
||||
let aggr_func_expr = &aggr_exec.aggr_expr()[0];
|
||||
|
||||
let mut merge_accum = aggr_func_expr.create_accumulator().unwrap();
|
||||
|
||||
let merge_input = vec![
|
||||
Arc::new(Int64Array::from(vec![Some(3), Some(4)])) as arrow::array::ArrayRef,
|
||||
Arc::new(Int64Array::from(vec![Some(3), Some(4)])),
|
||||
Arc::new(BooleanArray::from(vec![Some(true), Some(true)])),
|
||||
];
|
||||
let merge_input_struct_arr = StructArray::try_new(
|
||||
vec![
|
||||
Field::new("last_value[last_value]", DataType::Int64, true),
|
||||
Field::new("number", DataType::Int64, true),
|
||||
Field::new("is_set", DataType::Boolean, true),
|
||||
]
|
||||
.into(),
|
||||
merge_input,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
merge_accum
|
||||
.update_batch(&[Arc::new(merge_input_struct_arr)])
|
||||
.unwrap();
|
||||
let merge_state = merge_accum.state().unwrap();
|
||||
assert_eq!(merge_state.len(), 3);
|
||||
assert_eq!(merge_state[0], ScalarValue::Int64(Some(4)));
|
||||
assert_eq!(merge_state[1], ScalarValue::Int64(Some(4)));
|
||||
assert_eq!(merge_state[2], ScalarValue::Boolean(Some(true)));
|
||||
|
||||
let merge_eval_res = merge_accum.evaluate().unwrap();
|
||||
// the merge function returns the last value, which is 4
|
||||
assert_eq!(merge_eval_res, ScalarValue::Int64(Some(4)));
|
||||
}
|
||||
|
||||
/// For testing whether the UDAF state fields are correctly implemented,
/// especially for our own custom UDAFs' state fields,
/// by comparing eval results before and after splitting into state/merge functions.
|
||||
@@ -548,6 +752,7 @@ async fn test_udaf_correct_eval_result() {
|
||||
input_schema: SchemaRef,
|
||||
input: Vec<ArrayRef>,
|
||||
expected_output: Option<ScalarValue>,
|
||||
// extra check function on the final array result
|
||||
expected_fn: Option<ExpectedFn>,
|
||||
distinct: bool,
|
||||
filter: Option<Box<Expr>>,
|
||||
@@ -578,6 +783,27 @@ async fn test_udaf_correct_eval_result() {
|
||||
order_by: vec![],
|
||||
null_treatment: None,
|
||||
},
|
||||
TestCase {
|
||||
func: count_udaf(),
|
||||
input_schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
|
||||
"str_val",
|
||||
DataType::Utf8,
|
||||
true,
|
||||
)])),
|
||||
args: vec![Expr::Column(Column::new_unqualified("str_val"))],
|
||||
input: vec![Arc::new(StringArray::from(vec![
|
||||
Some("hello"),
|
||||
Some("world"),
|
||||
None,
|
||||
Some("what"),
|
||||
]))],
|
||||
expected_output: Some(ScalarValue::Int64(Some(3))),
|
||||
expected_fn: None,
|
||||
distinct: false,
|
||||
filter: None,
|
||||
order_by: vec![],
|
||||
null_treatment: None,
|
||||
},
|
||||
TestCase {
|
||||
func: avg_udaf(),
|
||||
input_schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
|
||||
|
||||
@@ -280,6 +280,8 @@ fn build_struct(
|
||||
&self,
|
||||
args: datafusion::logical_expr::ScalarFunctionArgs,
|
||||
) -> datafusion_common::Result<datafusion_expr::ColumnarValue> {
|
||||
use common_error::ext::ErrorExt;
|
||||
|
||||
let columns = args.args
|
||||
.iter()
|
||||
.map(|arg| {
|
||||
@@ -293,7 +295,7 @@ fn build_struct(
|
||||
})
|
||||
})
|
||||
.collect::<common_query::error::Result<Vec<_>>>()
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Column conversion error: {}", e)))?;
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Column conversion error: {}", e.output_msg())))?;
|
||||
|
||||
// Safety check: Ensure under the `greptime` catalog for security
|
||||
#user_path::ensure_greptime!(self.func_ctx);
|
||||
@@ -314,14 +316,14 @@ fn build_struct(
|
||||
.#handler
|
||||
.as_ref()
|
||||
.context(#snafu_type)
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Handler error: {}", e)))?;
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Handler error: {}", e.output_msg())))?;
|
||||
|
||||
let mut builder = store_api::storage::ConcreteDataType::#ret()
|
||||
.create_mutable_vector(rows_num);
|
||||
|
||||
if columns_num == 0 {
|
||||
let result = #fn_name(handler, query_ctx, &[]).await
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Function execution error: {}", e)))?;
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Function execution error: {}", e.output_msg())))?;
|
||||
|
||||
builder.push_value_ref(result.as_value_ref());
|
||||
} else {
|
||||
@@ -331,7 +333,7 @@ fn build_struct(
|
||||
.collect();
|
||||
|
||||
let result = #fn_name(handler, query_ctx, &args).await
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Function execution error: {}", e)))?;
|
||||
.map_err(|e| datafusion_common::DataFusionError::Execution(format!("Function execution error: {}", e.output_msg())))?;
|
||||
|
||||
builder.push_value_ref(result.as_value_ref());
|
||||
}
|
||||
|
||||
@@ -108,10 +108,6 @@ pub struct OpenRegion {
|
||||
pub region_wal_options: HashMap<RegionNumber, String>,
|
||||
#[serde(default)]
|
||||
pub skip_wal_replay: bool,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub replay_entry_id: Option<u64>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub metadata_replay_entry_id: Option<u64>,
|
||||
}
|
||||
|
||||
impl OpenRegion {
|
||||
@@ -128,22 +124,8 @@ impl OpenRegion {
|
||||
region_options,
|
||||
region_wal_options,
|
||||
skip_wal_replay,
|
||||
replay_entry_id: None,
|
||||
metadata_replay_entry_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the replay entry id.
|
||||
pub fn with_replay_entry_id(mut self, replay_entry_id: Option<u64>) -> Self {
|
||||
self.replay_entry_id = replay_entry_id;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the metadata replay entry id.
|
||||
pub fn with_metadata_replay_entry_id(mut self, metadata_replay_entry_id: Option<u64>) -> Self {
|
||||
self.metadata_replay_entry_id = metadata_replay_entry_id;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// The instruction of downgrading leader region.
|
||||
@@ -169,7 +151,7 @@ impl Display for DowngradeRegion {
|
||||
}
|
||||
|
||||
/// Upgrades a follower region to leader region.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
pub struct UpgradeRegion {
|
||||
/// The [RegionId].
|
||||
pub region_id: RegionId,
|
||||
@@ -186,6 +168,24 @@ pub struct UpgradeRegion {
|
||||
/// The hint for replaying memtable.
|
||||
#[serde(default)]
|
||||
pub location_id: Option<u64>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub replay_entry_id: Option<u64>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub metadata_replay_entry_id: Option<u64>,
|
||||
}
|
||||
|
||||
impl UpgradeRegion {
|
||||
/// Sets the replay entry id.
|
||||
pub fn with_replay_entry_id(mut self, replay_entry_id: Option<u64>) -> Self {
|
||||
self.replay_entry_id = replay_entry_id;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the metadata replay entry id.
|
||||
pub fn with_metadata_replay_entry_id(mut self, metadata_replay_entry_id: Option<u64>) -> Self {
|
||||
self.metadata_replay_entry_id = metadata_replay_entry_id;
|
||||
self
|
||||
}
|
||||
}
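As a usage illustration (a sketch; `region_id` is a placeholder, and it relies on the `Default` derive added above plus the new builder methods):

use store_api::storage::RegionId;

// Build an upgrade instruction that carries replay checkpoint hints.
fn example_upgrade(region_id: RegionId) -> UpgradeRegion {
    UpgradeRegion {
        region_id,
        ..Default::default()
    }
    .with_replay_entry_id(Some(1024))
    .with_metadata_replay_entry_id(Some(8))
}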
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
@@ -370,8 +370,6 @@ mod tests {
|
||||
region_options,
|
||||
region_wal_options: HashMap::new(),
|
||||
skip_wal_replay: false,
|
||||
replay_entry_id: None,
|
||||
metadata_replay_entry_id: None,
|
||||
};
|
||||
assert_eq!(expected, deserialized);
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ pub struct TopicRegionValue {
|
||||
pub checkpoint: Option<ReplayCheckpoint>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct ReplayCheckpoint {
|
||||
#[serde(default)]
|
||||
pub entry_id: u64,
|
||||
|
||||
@@ -24,6 +24,7 @@ use datatypes::schema::ColumnSchema;
|
||||
use futures::future::{join_all, try_join_all};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::metadata::{ColumnMetadata, RegionMetadata};
|
||||
use store_api::storage::consts::ReservedColumnId;
|
||||
use store_api::storage::{RegionId, TableId};
|
||||
use table::metadata::{RawTableInfo, RawTableMeta};
|
||||
use table::table_name::TableName;
|
||||
@@ -384,6 +385,7 @@ pub(crate) fn build_table_meta_from_column_metadatas(
|
||||
|
||||
*next_column_id = column_ids
|
||||
.iter()
|
||||
.filter(|id| !ReservedColumnId::is_reserved(**id))
|
||||
.max()
|
||||
.map(|max| max + 1)
|
||||
.unwrap_or(*next_column_id)
|
||||
@@ -1039,9 +1041,13 @@ mod tests {
|
||||
fn test_build_table_info_from_column_metadatas() {
|
||||
let mut column_metadatas = new_test_column_metadatas();
|
||||
column_metadatas.push(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new("col3", ConcreteDataType::string_datatype(), true),
|
||||
column_schema: ColumnSchema::new(
|
||||
"__table_id",
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
),
|
||||
semantic_type: SemanticType::Tag,
|
||||
column_id: 3,
|
||||
column_id: ReservedColumnId::table_id(),
|
||||
});
|
||||
|
||||
let table_id = 1;
|
||||
@@ -1066,8 +1072,11 @@ mod tests {
|
||||
assert_eq!(new_table_meta.partition_key_indices, vec![2]);
|
||||
assert_eq!(new_table_meta.value_indices, vec![1, 2]);
|
||||
assert_eq!(new_table_meta.schema.timestamp_index, Some(1));
|
||||
assert_eq!(new_table_meta.column_ids, vec![0, 1, 2, 3]);
|
||||
assert_eq!(new_table_meta.next_column_id, 4);
|
||||
assert_eq!(
|
||||
new_table_meta.column_ids,
|
||||
vec![0, 1, 2, ReservedColumnId::table_id()]
|
||||
);
|
||||
assert_eq!(new_table_meta.next_column_id, table_meta.next_column_id);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -11,6 +11,7 @@ workspace = true
|
||||
async-trait.workspace = true
|
||||
bytes.workspace = true
|
||||
common-error.workspace = true
|
||||
common-function.workspace = true
|
||||
common-macro.workspace = true
|
||||
common-telemetry.workspace = true
|
||||
datafusion.workspace = true
|
||||
|
||||
@@ -16,9 +16,13 @@ use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use common_function::aggrs::aggr_wrapper::fix_order::{
|
||||
FixStateUdafOrderingAnalyzer, UnFixStateUdafOrderingAnalyzer,
|
||||
};
|
||||
use datafusion::execution::context::SessionState;
|
||||
use datafusion::execution::runtime_env::RuntimeEnv;
|
||||
use datafusion::execution::SessionStateBuilder;
|
||||
use datafusion::optimizer::AnalyzerRule;
|
||||
use datafusion::prelude::SessionConfig;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use datafusion_substrait::logical_plan::consumer::from_substrait_plan;
|
||||
@@ -47,6 +51,9 @@ impl SubstraitPlan for DFLogicalSubstraitConvertor {
|
||||
let df_plan = from_substrait_plan(&state, &plan)
|
||||
.await
|
||||
.context(DecodeDfPlanSnafu)?;
|
||||
let df_plan = FixStateUdafOrderingAnalyzer {}
|
||||
.analyze(df_plan, state.config_options())
|
||||
.context(DecodeDfPlanSnafu)?;
|
||||
Ok(df_plan)
|
||||
}
|
||||
|
||||
@@ -55,8 +62,11 @@ impl SubstraitPlan for DFLogicalSubstraitConvertor {
|
||||
plan: &Self::Plan,
|
||||
serializer: impl SerializerRegistry + 'static,
|
||||
) -> Result<Bytes, Self::Error> {
|
||||
let plan = UnFixStateUdafOrderingAnalyzer {}
|
||||
.analyze(plan.clone(), &Default::default())
|
||||
.context(EncodeDfPlanSnafu)?;
|
||||
let mut buf = BytesMut::new();
|
||||
let substrait_plan = self.to_sub_plan(plan, serializer)?;
|
||||
let substrait_plan = self.to_sub_plan(&plan, serializer)?;
|
||||
substrait_plan.encode(&mut buf).context(EncodeRelSnafu)?;
|
||||
|
||||
Ok(buf.freeze())
|
||||
|
||||
@@ -238,10 +238,7 @@ mod tests {
|
||||
// Upgrade region
|
||||
let instruction = Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
replay_timeout: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
});
|
||||
assert!(
|
||||
heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction)))
|
||||
|
||||
@@ -16,7 +16,7 @@ use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
|
||||
use common_meta::wal_options_allocator::prepare_wal_options;
|
||||
use futures_util::future::BoxFuture;
|
||||
use store_api::path_utils::table_dir;
|
||||
use store_api::region_request::{PathType, RegionOpenRequest, RegionRequest, ReplayCheckpoint};
|
||||
use store_api::region_request::{PathType, RegionOpenRequest, RegionRequest};
|
||||
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
|
||||
@@ -29,31 +29,18 @@ impl HandlerContext {
|
||||
mut region_options,
|
||||
region_wal_options,
|
||||
skip_wal_replay,
|
||||
replay_entry_id,
|
||||
metadata_replay_entry_id,
|
||||
}: OpenRegion,
|
||||
) -> BoxFuture<'static, Option<InstructionReply>> {
|
||||
Box::pin(async move {
|
||||
let region_id = Self::region_ident_to_region_id(®ion_ident);
|
||||
prepare_wal_options(&mut region_options, region_id, ®ion_wal_options);
|
||||
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
|
||||
(Some(replay_entry_id), Some(metadata_replay_entry_id)) => Some(ReplayCheckpoint {
|
||||
entry_id: replay_entry_id,
|
||||
metadata_entry_id: Some(metadata_replay_entry_id),
|
||||
}),
|
||||
(Some(replay_entry_id), None) => Some(ReplayCheckpoint {
|
||||
entry_id: replay_entry_id,
|
||||
metadata_entry_id: None,
|
||||
}),
|
||||
_ => None,
|
||||
};
|
||||
let request = RegionRequest::Open(RegionOpenRequest {
|
||||
engine: region_ident.engine,
|
||||
table_dir: table_dir(®ion_storage_path, region_id.table_id()),
|
||||
path_type: PathType::Bare,
|
||||
options: region_options,
|
||||
skip_wal_replay,
|
||||
checkpoint,
|
||||
checkpoint: None,
|
||||
});
|
||||
let result = self.region_server.handle_request(region_id, request).await;
|
||||
let success = result.is_ok();
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply};
|
||||
use common_telemetry::{info, warn};
|
||||
use futures_util::future::BoxFuture;
|
||||
use store_api::region_request::{RegionCatchupRequest, RegionRequest};
|
||||
use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint};
|
||||
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::heartbeat::task_tracker::WaitResult;
|
||||
@@ -29,6 +29,8 @@ impl HandlerContext {
|
||||
metadata_last_entry_id,
|
||||
replay_timeout,
|
||||
location_id,
|
||||
replay_entry_id,
|
||||
metadata_replay_entry_id,
|
||||
}: UpgradeRegion,
|
||||
) -> BoxFuture<'static, Option<InstructionReply>> {
|
||||
Box::pin(async move {
|
||||
@@ -50,6 +52,14 @@ impl HandlerContext {
|
||||
|
||||
let region_server_moved = self.region_server.clone();
|
||||
|
||||
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
|
||||
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
|
||||
entry_id,
|
||||
metadata_entry_id,
|
||||
}),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// The catchup task is almost zero cost if the inside region is writable.
|
||||
// Therefore, it always registers a new catchup task.
|
||||
let register_result = self
|
||||
@@ -66,6 +76,7 @@ impl HandlerContext {
|
||||
entry_id: last_entry_id,
|
||||
metadata_entry_id: metadata_last_entry_id,
|
||||
location_id,
|
||||
checkpoint,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -148,10 +159,8 @@ mod tests {
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
replay_timeout,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
@@ -187,10 +196,8 @@ mod tests {
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
replay_timeout,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
@@ -227,10 +234,8 @@ mod tests {
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
replay_timeout,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
@@ -271,9 +276,7 @@ mod tests {
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
@@ -289,10 +292,8 @@ mod tests {
|
||||
let reply = handler_context
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
replay_timeout: Some(Duration::from_millis(500)),
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
@@ -332,10 +333,7 @@ mod tests {
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
replay_timeout: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
@@ -351,10 +349,8 @@ mod tests {
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
replay_timeout: Some(Duration::from_millis(200)),
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
|
||||
@@ -29,10 +29,15 @@ use common_runtime::JoinHandle;
|
||||
use common_telemetry::tracing::warn;
|
||||
use common_telemetry::{debug, info};
|
||||
use common_time::TimeToLive;
|
||||
use datafusion_common::tree_node::{TreeNodeRecursion, TreeNodeVisitor};
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use query::QueryEngineRef;
|
||||
use session::context::QueryContext;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use sql::parsers::utils::is_tql;
|
||||
use store_api::storage::{RegionId, TableId};
|
||||
use table::table_reference::TableReference;
|
||||
use tokio::sync::{oneshot, RwLock};
|
||||
|
||||
use crate::batching_mode::frontend_client::FrontendClient;
|
||||
@@ -42,8 +47,8 @@ use crate::batching_mode::utils::sql_to_df_plan;
|
||||
use crate::batching_mode::BatchingModeOptions;
|
||||
use crate::engine::FlowEngine;
|
||||
use crate::error::{
|
||||
CreateFlowSnafu, ExternalSnafu, FlowAlreadyExistSnafu, FlowNotFoundSnafu, InvalidQuerySnafu,
|
||||
TableNotFoundMetaSnafu, UnexpectedSnafu, UnsupportedSnafu,
|
||||
CreateFlowSnafu, DatafusionSnafu, ExternalSnafu, FlowAlreadyExistSnafu, FlowNotFoundSnafu,
|
||||
InvalidQuerySnafu, TableNotFoundMetaSnafu, UnexpectedSnafu, UnsupportedSnafu,
|
||||
};
|
||||
use crate::metrics::METRIC_FLOW_BATCHING_ENGINE_BULK_MARK_TIME_WINDOW;
|
||||
use crate::{CreateFlowArgs, Error, FlowId, TableName};
|
||||
@@ -151,9 +156,11 @@ impl BatchingEngine {
|
||||
let handle: JoinHandle<Result<(), Error>> = tokio::spawn(async move {
|
||||
let src_table_names = &task.config.source_table_names;
|
||||
let mut all_dirty_windows = HashSet::new();
|
||||
let mut is_dirty = false;
|
||||
for src_table_name in src_table_names {
|
||||
if let Some((timestamps, unit)) = group_by_table_name.get(src_table_name) {
|
||||
let Some(expr) = &task.config.time_window_expr else {
|
||||
is_dirty = true;
|
||||
continue;
|
||||
};
|
||||
for timestamp in timestamps {
|
||||
@@ -168,6 +175,9 @@ impl BatchingEngine {
|
||||
}
|
||||
}
|
||||
let mut state = task.state.write().unwrap();
|
||||
if is_dirty {
|
||||
state.dirty_time_windows.set_dirty();
|
||||
}
|
||||
let flow_id_label = task.config.flow_id.to_string();
|
||||
for timestamp in all_dirty_windows {
|
||||
state.dirty_time_windows.add_window(timestamp, None);
|
||||
@@ -269,9 +279,12 @@ impl BatchingEngine {
|
||||
let handle: JoinHandle<Result<(), Error>> = tokio::spawn(async move {
|
||||
let src_table_names = &task.config.source_table_names;
|
||||
|
||||
let mut is_dirty = false;
|
||||
|
||||
for src_table_name in src_table_names {
|
||||
if let Some(entry) = group_by_table_name.get(src_table_name) {
|
||||
let Some(expr) = &task.config.time_window_expr else {
|
||||
is_dirty = true;
|
||||
continue;
|
||||
};
|
||||
let involved_time_windows = expr.handle_rows(entry.clone()).await?;
|
||||
@@ -281,6 +294,10 @@ impl BatchingEngine {
|
||||
.add_lower_bounds(involved_time_windows.into_iter());
|
||||
}
|
||||
}
|
||||
if is_dirty {
|
||||
task.state.write().unwrap().dirty_time_windows.set_dirty();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
});
|
||||
handles.push(handle);
|
||||
@@ -370,13 +387,12 @@ impl BatchingEngine {
|
||||
}
|
||||
})?;
|
||||
let query_ctx = Arc::new(query_ctx);
|
||||
let is_tql = is_tql(query_ctx.sql_dialect(), &sql)
|
||||
.map_err(BoxedError::new)
|
||||
.context(CreateFlowSnafu { sql: &sql })?;
|
||||
|
||||
// optionally set an eval interval for the flow
|
||||
if eval_interval.is_none()
|
||||
&& is_tql(query_ctx.sql_dialect(), &sql)
|
||||
.map_err(BoxedError::new)
|
||||
.context(CreateFlowSnafu { sql: &sql })?
|
||||
{
|
||||
if eval_interval.is_none() && is_tql {
|
||||
InvalidQuerySnafu {
|
||||
reason: "TQL query requires EVAL INTERVAL to be set".to_string(),
|
||||
}
|
||||
@@ -418,6 +434,11 @@ impl BatchingEngine {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
let plan = sql_to_df_plan(query_ctx.clone(), self.query_engine.clone(), &sql, true).await?;
|
||||
|
||||
if is_tql {
|
||||
self.check_is_tql_table(&plan, &query_ctx).await?;
|
||||
}
|
||||
|
||||
let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
|
||||
&plan,
|
||||
self.query_engine.engine_state().catalog_manager().clone(),
|
||||
@@ -484,6 +505,131 @@ impl BatchingEngine {
|
||||
Ok(Some(flow_id))
|
||||
}
|
||||
|
||||
async fn check_is_tql_table(
|
||||
&self,
|
||||
query: &LogicalPlan,
|
||||
query_ctx: &QueryContext,
|
||||
) -> Result<(), Error> {
|
||||
struct CollectTableRef {
|
||||
table_refs: HashSet<datafusion_common::TableReference>,
|
||||
}
|
||||
|
||||
impl TreeNodeVisitor<'_> for CollectTableRef {
|
||||
type Node = LogicalPlan;
|
||||
fn f_down(
|
||||
&mut self,
|
||||
node: &Self::Node,
|
||||
) -> datafusion_common::Result<TreeNodeRecursion> {
|
||||
if let LogicalPlan::TableScan(scan) = node {
|
||||
self.table_refs.insert(scan.table_name.clone());
|
||||
}
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
}
|
||||
}
|
||||
let mut table_refs = CollectTableRef {
|
||||
table_refs: HashSet::new(),
|
||||
};
|
||||
query
|
||||
.visit_with_subqueries(&mut table_refs)
|
||||
.context(DatafusionSnafu {
|
||||
context: "Checking if all source tables are TQL tables",
|
||||
})?;
|
||||
|
||||
let default_catalog = query_ctx.current_catalog();
|
||||
let default_schema = query_ctx.current_schema();
|
||||
let default_schema = &default_schema;
|
||||
|
||||
for table_ref in table_refs.table_refs {
|
||||
let table_ref = match &table_ref {
|
||||
datafusion_common::TableReference::Bare { table } => {
|
||||
TableReference::full(default_catalog, default_schema, table)
|
||||
}
|
||||
datafusion_common::TableReference::Partial { schema, table } => {
|
||||
TableReference::full(default_catalog, schema, table)
|
||||
}
|
||||
datafusion_common::TableReference::Full {
|
||||
catalog,
|
||||
schema,
|
||||
table,
|
||||
} => TableReference::full(catalog, schema, table),
|
||||
};
|
||||
|
||||
let table_id = self
|
||||
.table_meta
|
||||
.table_name_manager()
|
||||
.get(table_ref.into())
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
reason: format!("Failed to get table id for table: {}", table_ref),
|
||||
})?
|
||||
.table_id();
|
||||
let table_info =
|
||||
get_table_info(self.table_meta.table_info_manager(), &table_id).await?;
|
||||
// first check that there is exactly one f64 value column
|
||||
let value_cols = table_info
|
||||
.table_info
|
||||
.meta
|
||||
.schema
|
||||
.column_schemas
|
||||
.iter()
|
||||
.filter(|col| col.data_type == ConcreteDataType::float64_datatype())
|
||||
.collect::<Vec<_>>();
|
||||
ensure!(
|
||||
value_cols.len() == 1,
|
||||
InvalidQuerySnafu {
|
||||
reason: format!(
|
||||
"TQL query only supports one f64 value column, table `{}`(id={}) has {} f64 value columns, columns are: {:?}",
|
||||
table_ref,
|
||||
table_id,
|
||||
value_cols.len(),
|
||||
value_cols
|
||||
),
|
||||
}
|
||||
);
|
||||
// TODO(discord9): do we need to check that the remaining columns are string tag columns?
|
||||
let pk_idxs = table_info
|
||||
.table_info
|
||||
.meta
|
||||
.primary_key_indices
|
||||
.iter()
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
for (idx, col) in table_info
|
||||
.table_info
|
||||
.meta
|
||||
.schema
|
||||
.column_schemas
|
||||
.iter()
|
||||
.enumerate()
|
||||
{
|
||||
// three cases:
|
||||
// 1. val column
|
||||
// 2. timestamp column
|
||||
// 3. tag column (string)
|
||||
|
||||
let is_pk: bool = pk_idxs.contains(&&idx);
|
||||
|
||||
ensure!(
|
||||
col.data_type == ConcreteDataType::float64_datatype()
|
||||
|| col.data_type.is_timestamp()
|
||||
|| (col.data_type == ConcreteDataType::string_datatype() && is_pk),
|
||||
InvalidQuerySnafu {
|
||||
reason: format!(
|
||||
"TQL query only supports f64 value column, timestamp column and string tag columns, table `{}`(id={}) has column `{}` with type {:?} which is not supported",
|
||||
table_ref,
|
||||
table_id,
|
||||
col.name,
|
||||
col.data_type
|
||||
),
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
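For reference, a table shape that passes this check might look like the following sketch (column names are placeholders; assumes the usual `ConcreteDataType` constructors, including `timestamp_millisecond_datatype`): one f64 value column, one timestamp column, and string tag columns that belong to the primary key.

use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;

// A TQL-compatible source table: value (f64), timestamp, and a string tag.
fn example_tql_columns() -> Vec<ColumnSchema> {
    vec![
        ColumnSchema::new("val", ConcreteDataType::float64_datatype(), true),
        ColumnSchema::new(
            "ts",
            ConcreteDataType::timestamp_millisecond_datatype(),
            false,
        ),
        // tag column, expected to be part of the primary key
        ColumnSchema::new("host", ConcreteDataType::string_datatype(), true),
    ]
}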
|
||||
|
||||
pub async fn remove_flow_inner(&self, flow_id: FlowId) -> Result<(), Error> {
|
||||
if self.tasks.write().await.remove(&flow_id).is_none() {
|
||||
warn!("Flow {flow_id} not found in tasks");
|
||||
|
||||
@@ -203,11 +203,21 @@ impl DirtyTimeWindows {
|
||||
self.windows.clear();
|
||||
}
|
||||
|
||||
/// Mark the windows as dirty. Only useful for a full aggregation without a time window,
/// to record that some new data has been inserted.
|
||||
pub fn set_dirty(&mut self) {
|
||||
self.windows.insert(Timestamp::new_second(0), None);
|
||||
}
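A minimal sketch of the intended use (the helper name is hypothetical; it only relies on `set_dirty` and `is_empty` shown in this hunk): a flow doing full aggregation has no time-window expression, so new input is recorded via the zero-timestamp sentinel window rather than a concrete range.

// Record that new data arrived for a flow without a time window expression.
fn mark_new_data(windows: &mut DirtyTimeWindows) {
    windows.set_dirty();
    debug_assert!(!windows.is_empty());
}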
|
||||
|
||||
/// Number of dirty windows.
|
||||
pub fn len(&self) -> usize {
|
||||
self.windows.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.windows.is_empty()
|
||||
}
|
||||
|
||||
/// Get the effective count of time windows, i.e. the number of time windows that can be
/// used for a query, computed as the total time window range divided by `window_size`.
|
||||
pub fn effective_count(&self, window_size: &Duration) -> usize {
|
||||
|
||||
@@ -48,8 +48,8 @@ use crate::batching_mode::frontend_client::FrontendClient;
|
||||
use crate::batching_mode::state::{FilterExprInfo, TaskState};
|
||||
use crate::batching_mode::time_window::TimeWindowExpr;
|
||||
use crate::batching_mode::utils::{
|
||||
get_table_info_df_schema, sql_to_df_plan, AddAutoColumnRewriter, AddFilterRewriter,
|
||||
FindGroupByFinalName,
|
||||
gen_plan_with_matching_schema, get_table_info_df_schema, sql_to_df_plan, AddFilterRewriter,
|
||||
ColumnMatcherRewriter, FindGroupByFinalName,
|
||||
};
|
||||
use crate::batching_mode::BatchingModeOptions;
|
||||
use crate::df_optimizer::apply_df_optimizer;
|
||||
@@ -618,42 +618,63 @@ impl BatchingTask {
|
||||
.map(|expr| expr.eval(low_bound))
|
||||
.transpose()?;
|
||||
|
||||
let (Some((Some(l), Some(u))), QueryType::Sql) =
|
||||
(expire_time_window_bound, &self.config.query_type)
|
||||
else {
|
||||
// either no time window or not a sql query, then just use the original query
|
||||
// use sink_table_meta to add to query the `update_at` and `__ts_placeholder` column's value too for compatibility reason
|
||||
debug!(
|
||||
"Flow id = {:?}, can't get window size: precise_lower_bound={expire_time_window_bound:?}, using the same query", self.config.flow_id
|
||||
);
|
||||
// clean dirty time window too, this could be from create flow's check_execute
|
||||
self.state.write().unwrap().dirty_time_windows.clean();
|
||||
let (expire_lower_bound, expire_upper_bound) =
|
||||
match (expire_time_window_bound, &self.config.query_type) {
|
||||
(Some((Some(l), Some(u))), QueryType::Sql) => (l, u),
|
||||
(None, QueryType::Sql) => {
|
||||
// if it's a SQL query and no time window lower/upper bound is found, just return the original query (with auto columns)
// use sink_table_meta to also add the `update_at` and `__ts_placeholder` columns' values to the query for compatibility reasons
|
||||
debug!(
|
||||
"Flow id = {:?}, no time window, using the same query",
|
||||
self.config.flow_id
|
||||
);
|
||||
// clean dirty time window too, this could be from create flow's check_execute
|
||||
let is_dirty = !self.state.read().unwrap().dirty_time_windows.is_empty();
|
||||
self.state.write().unwrap().dirty_time_windows.clean();
|
||||
|
||||
// TODO(discord9): should auto columns be skipped for TQL queries?
|
||||
let mut add_auto_column = AddAutoColumnRewriter::new(sink_table_schema.clone());
|
||||
if !is_dirty {
|
||||
// no dirty data, hence no need to update
|
||||
debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let plan = sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, false)
|
||||
.await?;
|
||||
let plan = gen_plan_with_matching_schema(
|
||||
&self.config.query,
|
||||
query_ctx,
|
||||
engine,
|
||||
sink_table_schema.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let plan = plan
|
||||
.clone()
|
||||
.rewrite(&mut add_auto_column)
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!("Failed to rewrite plan:\n {}\n", plan),
|
||||
})?
|
||||
.data;
|
||||
return Ok(Some(PlanInfo { plan, filter: None }));
|
||||
}
|
||||
_ => {
|
||||
// clean up, since TQL queries have no use for time windows
|
||||
self.state.write().unwrap().dirty_time_windows.clean();
|
||||
|
||||
// since no time window lower/upper bound is found, just return the original query(with auto columns)
|
||||
return Ok(Some(PlanInfo { plan, filter: None }));
|
||||
};
|
||||
let plan = gen_plan_with_matching_schema(
|
||||
&self.config.query,
|
||||
query_ctx,
|
||||
engine,
|
||||
sink_table_schema.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
return Ok(Some(PlanInfo { plan, filter: None }));
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Flow id = {:?}, found time window: precise_lower_bound={:?}, precise_upper_bound={:?} with dirty time windows: {:?}",
|
||||
self.config.flow_id, l, u, self.state.read().unwrap().dirty_time_windows
|
||||
self.config.flow_id, expire_lower_bound, expire_upper_bound, self.state.read().unwrap().dirty_time_windows
|
||||
);
|
||||
let window_size = u.sub(&l).with_context(|| UnexpectedSnafu {
|
||||
reason: format!("Can't get window size from {u:?} - {l:?}"),
|
||||
})?;
|
||||
let window_size = expire_upper_bound
|
||||
.sub(&expire_lower_bound)
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Can't get window size from {expire_upper_bound:?} - {expire_lower_bound:?}"
|
||||
),
|
||||
})?;
|
||||
let col_name = self
|
||||
.config
|
||||
.time_window_expr
|
||||
@@ -673,7 +694,7 @@ impl BatchingTask {
|
||||
.dirty_time_windows
|
||||
.gen_filter_exprs(
|
||||
&col_name,
|
||||
Some(l),
|
||||
Some(expire_lower_bound),
|
||||
window_size,
|
||||
max_window_cnt
|
||||
.unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query),
|
||||
@@ -701,7 +722,7 @@ impl BatchingTask {
|
||||
};
|
||||
|
||||
let mut add_filter = AddFilterRewriter::new(expr.expr.clone());
|
||||
let mut add_auto_column = AddAutoColumnRewriter::new(sink_table_schema.clone());
|
||||
let mut add_auto_column = ColumnMatcherRewriter::new(sink_table_schema.clone());
|
||||
|
||||
let plan =
|
||||
sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, false).await?;
|
||||
@@ -732,7 +753,25 @@ fn create_table_with_expr(
|
||||
sink_table_name: &[String; 3],
|
||||
query_type: &QueryType,
|
||||
) -> Result<CreateTableExpr, Error> {
|
||||
let (first_time_stamp, primary_keys) = build_primary_key_constraint(plan)?;
|
||||
let table_def = match query_type {
|
||||
&QueryType::Sql => {
|
||||
if let Some(def) = build_pk_from_aggr(plan)? {
|
||||
def
|
||||
} else {
|
||||
build_by_sql_schema(plan)?
|
||||
}
|
||||
}
|
||||
QueryType::Tql => {
|
||||
// first try to build from the aggregate, then fall back to the TQL schema, since a TQL query might not have an aggregate node
|
||||
if let Some(table_def) = build_pk_from_aggr(plan)? {
|
||||
table_def
|
||||
} else {
|
||||
build_by_tql_schema(plan)?
|
||||
}
|
||||
}
|
||||
};
|
||||
let first_time_stamp = table_def.ts_col;
|
||||
let primary_keys = table_def.pks;
|
||||
|
||||
let mut column_schemas = Vec::new();
|
||||
for field in plan.schema().fields() {
|
||||
@@ -755,7 +794,7 @@ fn create_table_with_expr(
|
||||
let is_val_column = !is_tag_column && first_time_stamp.as_ref() != Some(name);
|
||||
if is_val_column {
|
||||
let col_schema =
|
||||
ColumnSchema::new("val", ConcreteDataType::float64_datatype(), true);
|
||||
ColumnSchema::new(name, ConcreteDataType::float64_datatype(), true);
|
||||
column_schemas.push(col_schema);
|
||||
} else if is_tag_column {
|
||||
let col_schema =
|
||||
@@ -809,15 +848,63 @@ fn create_table_with_expr(
|
||||
})
|
||||
}
|
||||
|
||||
/// Simply build from the schema: return the first timestamp column and no primary keys.
|
||||
fn build_by_sql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
|
||||
let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
|
||||
if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
|
||||
Some(f.name().clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
Ok(TableDef {
|
||||
ts_col: first_time_stamp,
|
||||
pks: vec![],
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the first timestamp column found in the output schema and all string columns.
|
||||
fn build_by_tql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
|
||||
let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
|
||||
if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
|
||||
Some(f.name().clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
let string_columns = plan
|
||||
.schema()
|
||||
.fields()
|
||||
.iter()
|
||||
.filter_map(|f| {
|
||||
if ConcreteDataType::from_arrow_type(f.data_type()).is_string() {
|
||||
Some(f.name().clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
Ok(TableDef {
|
||||
ts_col: first_time_stamp,
|
||||
pks: string_columns,
|
||||
})
|
||||
}
|
||||
|
||||
struct TableDef {
|
||||
ts_col: Option<String>,
|
||||
pks: Vec<String>,
|
||||
}
|
||||
|
||||
/// Return first timestamp column which is in group by clause and other columns which are also in group by clause
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `Option<String>` - first timestamp column which is in group by clause
|
||||
/// * `Vec<String>` - other columns which are also in group by clause
|
||||
fn build_primary_key_constraint(
|
||||
plan: &LogicalPlan,
|
||||
) -> Result<(Option<String>, Vec<String>), Error> {
|
||||
///
|
||||
/// if no aggregation found, return None
|
||||
fn build_pk_from_aggr(plan: &LogicalPlan) -> Result<Option<TableDef>, Error> {
|
||||
let fields = plan.schema().fields();
|
||||
let mut pk_names = FindGroupByFinalName::default();
|
||||
|
||||
@@ -827,13 +914,18 @@ fn build_primary_key_constraint(
|
||||
})?;
|
||||
|
||||
// if no group by clause, return empty with first timestamp column found in output schema
|
||||
let pk_final_names = pk_names.get_group_expr_names().unwrap_or_default();
|
||||
let Some(pk_final_names) = pk_names.get_group_expr_names() else {
|
||||
return Ok(None);
|
||||
};
|
||||
if pk_final_names.is_empty() {
|
||||
let first_ts_col = fields
|
||||
.iter()
|
||||
.find(|f| ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp())
|
||||
.map(|f| f.name().clone());
|
||||
return Ok((first_ts_col, Vec::new()));
|
||||
return Ok(Some(TableDef {
|
||||
ts_col: first_ts_col,
|
||||
pks: vec![],
|
||||
}));
|
||||
}
|
||||
|
||||
let all_pk_cols: Vec<_> = fields
|
||||
@@ -855,7 +947,10 @@ fn build_primary_key_constraint(
|
||||
.filter(|col| first_time_stamp != Some(col.to_string()))
|
||||
.collect();
|
||||
|
||||
Ok((first_time_stamp, all_pk_cols))
|
||||
Ok(Some(TableDef {
|
||||
ts_col: first_time_stamp,
|
||||
pks: all_pk_cols,
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -24,7 +24,7 @@ use datafusion::error::Result as DfResult;
|
||||
use datafusion::logical_expr::Expr;
|
||||
use datafusion::sql::unparser::Unparser;
|
||||
use datafusion_common::tree_node::{
|
||||
Transformed, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor,
|
||||
Transformed, TreeNode as _, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor,
|
||||
};
|
||||
use datafusion_common::{DFSchema, DataFusionError, ScalarValue};
|
||||
use datafusion_expr::{Distinct, LogicalPlan, Projection};
|
||||
@@ -135,6 +135,27 @@ pub async fn sql_to_df_plan(
|
||||
Ok(plan)
|
||||
}
|
||||
|
||||
/// Generate a plan that matches the schema of the sink table
/// from the given SQL, by aliasing columns and adding auto columns.
|
||||
pub(crate) async fn gen_plan_with_matching_schema(
|
||||
sql: &str,
|
||||
query_ctx: QueryContextRef,
|
||||
engine: QueryEngineRef,
|
||||
sink_table_schema: SchemaRef,
|
||||
) -> Result<LogicalPlan, Error> {
|
||||
let plan = sql_to_df_plan(query_ctx.clone(), engine.clone(), sql, false).await?;
|
||||
|
||||
let mut add_auto_column = ColumnMatcherRewriter::new(sink_table_schema);
|
||||
let plan = plan
|
||||
.clone()
|
||||
.rewrite(&mut add_auto_column)
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!("Failed to rewrite plan:\n {}\n", plan),
|
||||
})?
|
||||
.data;
|
||||
Ok(plan)
|
||||
}
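A hedged usage sketch, following this helper's signature and mirroring the call sites in the batching task (the SQL string and the wrapper name are placeholders):

use datafusion_expr::LogicalPlan;
use datatypes::schema::SchemaRef;
use query::QueryEngineRef;
use session::context::QueryContextRef;

// Produce a plan whose output columns line up with the sink table's schema.
async fn plan_for_sink(
    query_ctx: QueryContextRef,
    engine: QueryEngineRef,
    sink_table_schema: SchemaRef,
) -> Result<LogicalPlan, Error> {
    gen_plan_with_matching_schema(
        "SELECT ts, avg(val) FROM source GROUP BY ts",
        query_ctx,
        engine,
        sink_table_schema,
    )
    .await
}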
|
||||
|
||||
pub fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
|
||||
/// A dialect that forces identifiers to be quoted when have uppercase
|
||||
struct ForceQuoteIdentifiers;
|
||||
@@ -239,19 +260,19 @@ impl TreeNodeVisitor<'_> for FindGroupByFinalName {
|
||||
}
|
||||
}
|
||||
|
||||
/// Add to the final select columns like `update_at`
|
||||
/// Optionally add final select columns like `update_at` if the sink table has such a column
/// (it doesn't need to have that exact name; it just needs to be an extra timestamp column)
/// and `__ts_placeholder` (this column must have exactly this name and be a timestamp),
/// with values like `now()` and `0`.
///
/// It also aliases existing columns to the corresponding columns in the sink table if needed.
|
||||
#[derive(Debug)]
|
||||
pub struct AddAutoColumnRewriter {
|
||||
pub struct ColumnMatcherRewriter {
|
||||
pub schema: SchemaRef,
|
||||
pub is_rewritten: bool,
|
||||
}
|
||||
|
||||
impl AddAutoColumnRewriter {
|
||||
impl ColumnMatcherRewriter {
|
||||
pub fn new(schema: SchemaRef) -> Self {
|
||||
Self {
|
||||
schema,
|
||||
@@ -348,7 +369,7 @@ impl AddAutoColumnRewriter {
|
||||
}
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for AddAutoColumnRewriter {
|
||||
impl TreeNodeRewriter for ColumnMatcherRewriter {
|
||||
type Node = LogicalPlan;
|
||||
fn f_down(&mut self, mut node: Self::Node) -> DfResult<Transformed<Self::Node>> {
|
||||
if self.is_rewritten {
|
||||
@@ -696,7 +717,7 @@ mod test {
|
||||
let ctx = QueryContext::arc();
|
||||
for (before, after, column_schemas) in testcases {
|
||||
let schema = Arc::new(Schema::new(column_schemas));
|
||||
let mut add_auto_column_rewriter = AddAutoColumnRewriter::new(schema);
|
||||
let mut add_auto_column_rewriter = ColumnMatcherRewriter::new(schema);
|
||||
|
||||
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), before, false)
|
||||
.await
|
||||
|
||||
@@ -376,34 +376,16 @@ impl Instance {
|
||||
ctx: QueryContextRef,
|
||||
) -> server_error::Result<bool> {
|
||||
let db_string = ctx.get_db_string();
|
||||
// fast cache check
|
||||
let cache = self
|
||||
.otlp_metrics_table_legacy_cache
|
||||
.entry(db_string)
|
||||
.entry(db_string.clone())
|
||||
.or_default();
|
||||
|
||||
// check cache
|
||||
let hit_cache = names
|
||||
.iter()
|
||||
.filter_map(|name| cache.get(*name))
|
||||
.collect::<Vec<_>>();
|
||||
if !hit_cache.is_empty() {
|
||||
let hit_legacy = hit_cache.iter().any(|en| *en.value());
|
||||
let hit_prom = hit_cache.iter().any(|en| !*en.value());
|
||||
|
||||
// hit but have true and false, means both legacy and new mode are used
|
||||
// we cannot handle this case, so return error
|
||||
// add doc links in err msg later
|
||||
ensure!(!(hit_legacy && hit_prom), OtlpMetricModeIncompatibleSnafu);
|
||||
|
||||
let flag = hit_legacy;
|
||||
// set cache for all names
|
||||
names.iter().for_each(|name| {
|
||||
if !cache.contains_key(*name) {
|
||||
cache.insert(name.to_string(), flag);
|
||||
}
|
||||
});
|
||||
if let Some(flag) = fast_legacy_check(&cache, names)? {
|
||||
return Ok(flag);
|
||||
}
|
||||
// release cache reference to avoid lock contention
|
||||
drop(cache);
|
||||
|
||||
let catalog = ctx.current_catalog();
|
||||
let schema = ctx.current_schema();
|
||||
@@ -430,7 +412,10 @@ impl Instance {
|
||||
|
||||
// means no existing table is found, use new mode
|
||||
if table_ids.is_empty() {
|
||||
// set cache
|
||||
let cache = self
|
||||
.otlp_metrics_table_legacy_cache
|
||||
.entry(db_string)
|
||||
.or_default();
|
||||
names.iter().for_each(|name| {
|
||||
cache.insert(name.to_string(), false);
|
||||
});
|
||||
@@ -455,6 +440,10 @@ impl Instance {
|
||||
.unwrap_or(&OTLP_LEGACY_DEFAULT_VALUE)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let cache = self
|
||||
.otlp_metrics_table_legacy_cache
|
||||
.entry(db_string)
|
||||
.or_default();
|
||||
if !options.is_empty() {
|
||||
// check value consistency
|
||||
let has_prom = options.iter().any(|opt| *opt == OTLP_METRIC_COMPAT_PROM);
|
||||
@@ -477,6 +466,39 @@ impl Instance {
|
||||
}
|
||||
}
|
||||
|
||||
fn fast_legacy_check(
    cache: &DashMap<String, bool>,
    names: &[&String],
) -> server_error::Result<Option<bool>> {
    let hit_cache = names
        .iter()
        .filter_map(|name| cache.get(*name))
        .collect::<Vec<_>>();
    if !hit_cache.is_empty() {
        let hit_legacy = hit_cache.iter().any(|en| *en.value());
        let hit_prom = hit_cache.iter().any(|en| !*en.value());

        // hit but have true and false, means both legacy and new mode are used
        // we cannot handle this case, so return error
        // add doc links in err msg later
        ensure!(!(hit_legacy && hit_prom), OtlpMetricModeIncompatibleSnafu);

        let flag = hit_legacy;
        // drop hit_cache to release references before inserting to avoid deadlock
        drop(hit_cache);

        // set cache for all names
        names.iter().for_each(|name| {
            if !cache.contains_key(*name) {
                cache.insert(name.to_string(), flag);
            }
        });
        Ok(Some(flag))
    } else {
        Ok(None)
    }
}
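The `drop(hit_cache)` above is load-bearing: `dashmap` read guards hold a lock on the entry's shard, so inserting into the same map on the same thread while such guards are still alive can block indefinitely. A minimal sketch of the safe ordering (not GreptimeDB code; requires the `dashmap` crate; the function name is made up for the example):

use dashmap::DashMap;

// Sketch of the look-up-then-fill pattern used by fast_legacy_check.
fn cache_lookup_then_fill(cache: &DashMap<String, bool>, names: &[&str]) -> Option<bool> {
    // Collect the read guards first, like `hit_cache` above.
    let hits: Vec<_> = names.iter().filter_map(|n| cache.get(*n)).collect();
    if hits.is_empty() {
        return None;
    }
    let flag = hits.iter().any(|guard| *guard.value());

    // Release every read guard *before* writing back into the map; inserting while
    // `hits` is still alive could self-deadlock on a shared shard.
    drop(hits);

    for name in names {
        if !cache.contains_key(*name) {
            cache.insert((*name).to_string(), flag);
        }
    }
    Some(flag)
}

fn main() {
    let cache = DashMap::new();
    cache.insert("metric1".to_string(), true);
    assert_eq!(cache_lookup_then_fill(&cache, &["metric1", "metric4"]), Some(true));
    assert!(*cache.get("metric4").unwrap().value());
}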
|
||||
/// If the relevant variables are set, the timeout is enforced for all PostgreSQL statements.
|
||||
/// For MySQL, it applies only to read-only statements.
|
||||
fn derive_timeout(stmt: &Statement, query_ctx: &QueryContextRef) -> Option<Duration> {
|
||||
@@ -1039,6 +1061,10 @@ fn should_capture_statement(stmt: Option<&Statement>) -> bool {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Barrier};
|
||||
use std::thread;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_base::Plugins;
|
||||
use query::query_engine::options::QueryOptions;
|
||||
@@ -1048,6 +1074,122 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_fast_legacy_check_deadlock_prevention() {
|
||||
// Create a DashMap to simulate the cache
|
||||
let cache = DashMap::new();
|
||||
|
||||
// Pre-populate cache with some entries
|
||||
cache.insert("metric1".to_string(), true); // legacy mode
|
||||
cache.insert("metric2".to_string(), false); // prom mode
|
||||
cache.insert("metric3".to_string(), true); // legacy mode
|
||||
|
||||
// Test case 1: Normal operation with cache hits
|
||||
let metric1 = "metric1".to_string();
|
||||
let metric4 = "metric4".to_string();
|
||||
let names1 = vec![&metric1, &metric4];
|
||||
let result = fast_legacy_check(&cache, &names1);
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(result.unwrap(), Some(true)); // should return legacy mode
|
||||
|
||||
// Verify that metric4 was added to cache
|
||||
assert!(cache.contains_key("metric4"));
|
||||
assert!(*cache.get("metric4").unwrap().value());
|
||||
|
||||
// Test case 2: No cache hits
|
||||
let metric5 = "metric5".to_string();
|
||||
let metric6 = "metric6".to_string();
|
||||
let names2 = vec![&metric5, &metric6];
|
||||
let result = fast_legacy_check(&cache, &names2);
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(result.unwrap(), None); // should return None as no cache hits
|
||||
|
||||
// Test case 3: Incompatible modes should return error
|
||||
let cache_incompatible = DashMap::new();
|
||||
cache_incompatible.insert("metric1".to_string(), true); // legacy
|
||||
cache_incompatible.insert("metric2".to_string(), false); // prom
|
||||
let metric1_test = "metric1".to_string();
|
||||
let metric2_test = "metric2".to_string();
|
||||
let names3 = vec![&metric1_test, &metric2_test];
|
||||
let result = fast_legacy_check(&cache_incompatible, &names3);
|
||||
assert!(result.is_err()); // should error due to incompatible modes
|
||||
|
||||
// Test case 4: Intensive concurrent access to test deadlock prevention
|
||||
// This test specifically targets the scenario where multiple threads
|
||||
// access the same cache entries simultaneously
|
||||
let cache_concurrent = Arc::new(DashMap::new());
|
||||
cache_concurrent.insert("shared_metric".to_string(), true);
|
||||
|
||||
let num_threads = 8;
|
||||
let operations_per_thread = 100;
|
||||
let barrier = Arc::new(Barrier::new(num_threads));
|
||||
let success_flag = Arc::new(AtomicBool::new(true));
|
||||
|
||||
let handles: Vec<_> = (0..num_threads)
|
||||
.map(|thread_id| {
|
||||
let cache_clone = Arc::clone(&cache_concurrent);
|
||||
let barrier_clone = Arc::clone(&barrier);
|
||||
let success_flag_clone = Arc::clone(&success_flag);
|
||||
|
||||
thread::spawn(move || {
|
||||
// Wait for all threads to be ready
|
||||
barrier_clone.wait();
|
||||
|
||||
let start_time = Instant::now();
|
||||
for i in 0..operations_per_thread {
|
||||
// Each operation references existing cache entry and adds new ones
|
||||
let shared_metric = "shared_metric".to_string();
|
||||
let new_metric = format!("thread_{}_metric_{}", thread_id, i);
|
||||
let names = vec![&shared_metric, &new_metric];
|
||||
|
||||
match fast_legacy_check(&cache_clone, &names) {
|
||||
Ok(_) => {}
|
||||
Err(_) => {
|
||||
success_flag_clone.store(false, Ordering::Relaxed);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// If the test takes too long, it likely means deadlock
|
||||
if start_time.elapsed() > Duration::from_secs(10) {
|
||||
success_flag_clone.store(false, Ordering::Relaxed);
|
||||
return;
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Join all threads with timeout
|
||||
let start_time = Instant::now();
|
||||
for (i, handle) in handles.into_iter().enumerate() {
|
||||
let join_result = handle.join();
|
||||
|
||||
// Check if we're taking too long (potential deadlock)
|
||||
if start_time.elapsed() > Duration::from_secs(30) {
|
||||
panic!("Test timed out - possible deadlock detected!");
|
||||
}
|
||||
|
||||
if join_result.is_err() {
|
||||
panic!("Thread {} panicked during execution", i);
|
||||
}
|
||||
}
|
||||
|
||||
// Verify all operations completed successfully
|
||||
assert!(
|
||||
success_flag.load(Ordering::Relaxed),
|
||||
"Some operations failed"
|
||||
);
|
||||
|
||||
// Verify that many new entries were added (proving operations completed)
|
||||
let final_count = cache_concurrent.len();
|
||||
assert!(
|
||||
final_count > 1 + num_threads * operations_per_thread / 2,
|
||||
"Expected more cache entries, got {}",
|
||||
final_count
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exec_validation() {
|
||||
let query_ctx = QueryContext::arc();
|
||||
|
||||
@@ -302,6 +302,10 @@ impl LogStore for KafkaLogStore {
|
||||
},
|
||||
))
|
||||
.await?;
|
||||
debug!(
|
||||
"Appended batch to Kafka, region_grouped_max_offset: {:?}",
|
||||
region_grouped_max_offset
|
||||
);
|
||||
|
||||
Ok(AppendBatchResponse {
|
||||
last_entry_ids: region_grouped_max_offset.into_iter().collect(),
|
||||
@@ -362,6 +366,17 @@ impl LogStore for KafkaLogStore {
|
||||
.context(GetOffsetSnafu {
|
||||
topic: &provider.topic,
|
||||
})?;
|
||||
let latest_offset = (end_offset as u64).saturating_sub(1);
|
||||
self.topic_stats
|
||||
.entry(provider.clone())
|
||||
.and_modify(|stat| {
|
||||
stat.latest_offset = stat.latest_offset.max(latest_offset);
|
||||
})
|
||||
.or_insert_with(|| TopicStat {
|
||||
latest_offset,
|
||||
record_size: 0,
|
||||
record_num: 0,
|
||||
});
|
||||
|
||||
let region_indexes = if let (Some(index), Some(collector)) =
|
||||
(index, self.client_manager.global_index_collector())
|
||||
@@ -550,6 +565,7 @@ mod tests {
|
||||
use futures::TryStreamExt;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::Rng;
|
||||
use rskafka::client::partition::OffsetAt;
|
||||
use store_api::logstore::entry::{Entry, MultiplePartEntry, MultiplePartHeader, NaiveEntry};
|
||||
use store_api::logstore::provider::Provider;
|
||||
use store_api::logstore::LogStore;
|
||||
@@ -713,8 +729,16 @@ mod tests {
|
||||
.for_each(|entry| entry.set_entry_id(0));
|
||||
assert_eq!(expected_entries, actual_entries);
|
||||
}
|
||||
let high_wathermark = logstore.latest_entry_id(&provider).unwrap();
|
||||
assert_eq!(high_wathermark, 99);
|
||||
let latest_entry_id = logstore.latest_entry_id(&provider).unwrap();
|
||||
let client = logstore
|
||||
.client_manager
|
||||
.get_or_insert(provider.as_kafka_provider().unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(latest_entry_id, 99);
|
||||
// The latest offset is the offset of the last record plus one.
|
||||
let latest = client.client().get_offset(OffsetAt::Latest).await.unwrap();
|
||||
assert_eq!(latest, 100);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -112,11 +112,11 @@ mod tests {
        let current_latest_offset = topic_stats.get(&provider).unwrap().latest_offset;
        assert_eq!(current_latest_offset, 0);

        let record = vec![record()];
        let record = vec![record(), record()];
        let region = RegionId::new(1, 1);
        producer.produce(region, record.clone()).await.unwrap();
        tokio::time::sleep(Duration::from_millis(150)).await;
        let current_latest_offset = topic_stats.get(&provider).unwrap().latest_offset;
        assert_eq!(current_latest_offset, record.len() as u64);
        assert_eq!(current_latest_offset, record.len() as u64 - 1);
    }
}
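The corrected assertion above encodes the convention used throughout these changes: Kafka's high watermark is one past the last record, so the latest offset recorded in `TopicStat` is the high watermark minus one, saturating for an empty topic. A tiny illustrative helper (not part of the codebase):

// Convert a high watermark (one past the last record) into the latest record offset.
fn latest_offset_from_high_watermark(high_watermark: i64) -> u64 {
    (high_watermark.max(0) as u64).saturating_sub(1)
}

fn main() {
    assert_eq!(latest_offset_from_high_watermark(0), 0); // empty topic
    assert_eq!(latest_offset_from_high_watermark(2), 1); // two records at offsets 0 and 1
    assert_eq!(latest_offset_from_high_watermark(100), 99);
}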
|
||||
@@ -33,30 +33,34 @@ impl BackgroundProducerWorker {
|
||||
.context(error::GetOffsetSnafu {
|
||||
topic: &self.provider.topic,
|
||||
}) {
|
||||
Ok(offset) => match self.topic_stats.entry(self.provider.clone()) {
|
||||
dashmap::Entry::Occupied(mut occupied_entry) => {
|
||||
let offset = offset as u64;
|
||||
let stat = occupied_entry.get_mut();
|
||||
if stat.latest_offset < offset {
|
||||
stat.latest_offset = offset;
|
||||
Ok(highwatermark) => {
|
||||
// The highwatermark is the offset of the last record plus one.
|
||||
let offset = (highwatermark as u64).saturating_sub(1);
|
||||
|
||||
match self.topic_stats.entry(self.provider.clone()) {
|
||||
dashmap::Entry::Occupied(mut occupied_entry) => {
|
||||
let stat = occupied_entry.get_mut();
|
||||
if stat.latest_offset < offset {
|
||||
stat.latest_offset = offset;
|
||||
debug!(
|
||||
"Updated latest offset for topic {} to {}",
|
||||
self.provider.topic, offset
|
||||
);
|
||||
}
|
||||
}
|
||||
dashmap::Entry::Vacant(vacant_entry) => {
|
||||
vacant_entry.insert(TopicStat {
|
||||
latest_offset: offset,
|
||||
record_size: 0,
|
||||
record_num: 0,
|
||||
});
|
||||
debug!(
|
||||
"Updated latest offset for topic {} to {}",
|
||||
"Inserted latest offset for topic {} to {}",
|
||||
self.provider.topic, offset
|
||||
);
|
||||
}
|
||||
}
|
||||
dashmap::Entry::Vacant(vacant_entry) => {
|
||||
vacant_entry.insert(TopicStat {
|
||||
latest_offset: offset as u64,
|
||||
record_size: 0,
|
||||
record_num: 0,
|
||||
});
|
||||
debug!(
|
||||
"Inserted latest offset for topic {} to {}",
|
||||
self.provider.topic, offset
|
||||
);
|
||||
}
|
||||
},
|
||||
}
|
||||
Err(err) => {
|
||||
error!(err; "Failed to get latest offset for topic {}", self.provider.topic);
|
||||
}
|
||||
|
||||
@@ -461,6 +461,7 @@ fn build_connection_options(tls_config: Option<&TlsOption>) -> Result<Option<Con
|
||||
if matches!(tls_config.mode, TlsMode::Disable) {
|
||||
return Ok(None);
|
||||
}
|
||||
info!("Creating etcd client with TLS mode: {:?}", tls_config.mode);
|
||||
let mut etcd_tls_opts = TlsOptions::new();
|
||||
// Set CA certificate if provided
|
||||
if !tls_config.ca_cert_path.is_empty() {
|
||||
|
||||
@@ -152,13 +152,9 @@ fn align_ts(ts: i64, interval: Duration) -> i64 {
impl PersistStatsHandler {
    /// Creates a new [`PersistStatsHandler`].
    pub fn new(inserter: Box<dyn Inserter>, mut persist_interval: Duration) -> Self {
        if persist_interval < Duration::from_secs(60) {
            warn!("persist_interval is less than 60 seconds, set to 60 seconds");
            persist_interval = Duration::from_secs(60);
        }
        if persist_interval.as_millis() == 0 {
            warn!("persist_interval as milliseconds is zero, set to 60 second");
            persist_interval = Duration::from_secs(60);
        if persist_interval < Duration::from_mins(10) {
            warn!("persist_interval is less than 10 minutes, set to 10 minutes");
            persist_interval = Duration::from_mins(10);
        }

        Self {

@@ -16,6 +16,7 @@
#![feature(assert_matches)]
#![feature(hash_set_entry)]
#![feature(let_chains)]
#![feature(duration_constructors_lite)]
#![feature(duration_constructors)]

pub mod bootstrap;

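`Duration::from_mins` used in the clamp above is gated behind the `duration_constructors*` nightly features added in the same change (see the feature flags in the hunk just above). A stable-Rust sketch of the equivalent clamping, written as a free function rather than the handler's constructor:

use std::time::Duration;

// Illustrative stand-in for the constructor's clamping: intervals below ten minutes
// are raised to ten minutes; the real handler also logs a warning in that case.
fn clamp_persist_interval(requested: Duration) -> Duration {
    const MIN_INTERVAL: Duration = Duration::from_secs(10 * 60);
    requested.max(MIN_INTERVAL)
}

fn main() {
    assert_eq!(clamp_persist_interval(Duration::from_secs(60)), Duration::from_secs(600));
    assert_eq!(clamp_persist_interval(Duration::from_secs(1800)), Duration::from_secs(1800));
}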
@@ -114,8 +114,8 @@ pub struct StatsPersistenceOptions {
impl Default for StatsPersistenceOptions {
    fn default() -> Self {
        Self {
            ttl: Duration::from_days(30),
            interval: Duration::from_secs(60),
            ttl: Duration::ZERO,
            interval: Duration::from_mins(10),
        }
    }
}
|
||||
@@ -784,6 +784,10 @@ impl Metasrv {
|
||||
&self.plugins
|
||||
}
|
||||
|
||||
pub fn started(&self) -> Arc<AtomicBool> {
|
||||
self.started.clone()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_ctx(&self) -> Context {
|
||||
let server_addr = self.options().grpc.server_addr.clone();
|
||||
|
||||
@@ -82,7 +82,7 @@ lazy_static! {
        .unwrap();
    /// The triggered region flush total counter.
    pub static ref METRIC_META_TRIGGERED_REGION_FLUSH_TOTAL: IntCounterVec =
        register_int_counter_vec!("meta_triggered_region_flush_total", "meta triggered region flush total", &["topic_name", "region_type"]).unwrap();
        register_int_counter_vec!("meta_triggered_region_flush_total", "meta triggered region flush total", &["topic_name"]).unwrap();

    /// The triggered region checkpoint total counter.
    pub static ref METRIC_META_TRIGGERED_REGION_CHECKPOINT_TOTAL: IntCounterVec =
|
||||
|
||||
@@ -19,7 +19,6 @@ use api::v1::meta::MailboxMessage;
|
||||
use common_meta::distributed_time_constants::REGION_LEASE_SECS;
|
||||
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
|
||||
use common_meta::key::datanode_table::RegionInfo;
|
||||
use common_meta::wal_options_allocator::extract_topic_from_wal_options;
|
||||
use common_meta::RegionIdent;
|
||||
use common_procedure::{Context as ProcedureContext, Status};
|
||||
use common_telemetry::info;
|
||||
@@ -68,7 +67,6 @@ impl OpenCandidateRegion {
|
||||
async fn build_open_region_instruction(&self, ctx: &mut Context) -> Result<Instruction> {
|
||||
let pc = &ctx.persistent_ctx;
|
||||
let table_id = pc.region_id.table_id();
|
||||
let region_id = pc.region_id;
|
||||
let region_number = pc.region_id.region_number();
|
||||
let candidate_id = pc.to_peer.id;
|
||||
let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?;
|
||||
@@ -80,31 +78,18 @@ impl OpenCandidateRegion {
|
||||
engine,
|
||||
} = datanode_table_value.region_info.clone();
|
||||
|
||||
let checkpoint =
|
||||
if let Some(topic) = extract_topic_from_wal_options(region_id, ®ion_wal_options) {
|
||||
ctx.fetch_replay_checkpoint(&topic).await.ok().flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let open_instruction = Instruction::OpenRegion(
|
||||
OpenRegion::new(
|
||||
RegionIdent {
|
||||
datanode_id: candidate_id,
|
||||
table_id,
|
||||
region_number,
|
||||
engine,
|
||||
},
|
||||
®ion_storage_path,
|
||||
region_options,
|
||||
region_wal_options,
|
||||
true,
|
||||
)
|
||||
.with_replay_entry_id(checkpoint.map(|checkpoint| checkpoint.entry_id))
|
||||
.with_metadata_replay_entry_id(
|
||||
checkpoint.and_then(|checkpoint| checkpoint.metadata_entry_id),
|
||||
),
|
||||
);
|
||||
let open_instruction = Instruction::OpenRegion(OpenRegion::new(
|
||||
RegionIdent {
|
||||
datanode_id: candidate_id,
|
||||
table_id,
|
||||
region_number,
|
||||
engine,
|
||||
},
|
||||
®ion_storage_path,
|
||||
region_options,
|
||||
region_wal_options,
|
||||
true,
|
||||
));
|
||||
|
||||
Ok(open_instruction)
|
||||
}
|
||||
@@ -241,8 +226,6 @@ mod tests {
|
||||
region_options: Default::default(),
|
||||
region_wal_options: Default::default(),
|
||||
skip_wal_replay: true,
|
||||
replay_entry_id: None,
|
||||
metadata_replay_entry_id: None,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ use api::v1::meta::MailboxMessage;
|
||||
use common_meta::ddl::utils::parse_region_wal_options;
|
||||
use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply};
|
||||
use common_meta::lock_key::RemoteWalLock;
|
||||
use common_meta::wal_options_allocator::extract_topic_from_wal_options;
|
||||
use common_procedure::{Context as ProcedureContext, Status};
|
||||
use common_telemetry::{error, warn};
|
||||
use common_wal::options::WalOptions;
|
||||
@@ -111,23 +112,40 @@ impl UpgradeCandidateRegion {
|
||||
}
|
||||
|
||||
/// Builds upgrade region instruction.
|
||||
fn build_upgrade_region_instruction(
|
||||
async fn build_upgrade_region_instruction(
|
||||
&self,
|
||||
ctx: &Context,
|
||||
ctx: &mut Context,
|
||||
replay_timeout: Duration,
|
||||
) -> Instruction {
|
||||
) -> Result<Instruction> {
|
||||
let pc = &ctx.persistent_ctx;
|
||||
let region_id = pc.region_id;
|
||||
let last_entry_id = ctx.volatile_ctx.leader_region_last_entry_id;
|
||||
let metadata_last_entry_id = ctx.volatile_ctx.leader_region_metadata_last_entry_id;
|
||||
// Try our best to retrieve replay checkpoint.
|
||||
let datanode_table_value = ctx.get_from_peer_datanode_table_value().await.ok();
|
||||
let checkpoint = if let Some(topic) = datanode_table_value.as_ref().and_then(|v| {
|
||||
extract_topic_from_wal_options(region_id, &v.region_info.region_wal_options)
|
||||
}) {
|
||||
ctx.fetch_replay_checkpoint(&topic).await.ok().flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id,
|
||||
metadata_last_entry_id,
|
||||
replay_timeout: Some(replay_timeout),
|
||||
location_id: Some(ctx.persistent_ctx.from_peer.id),
|
||||
})
|
||||
let upgrade_instruction = Instruction::UpgradeRegion(
|
||||
UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id,
|
||||
metadata_last_entry_id,
|
||||
replay_timeout: Some(replay_timeout),
|
||||
location_id: Some(ctx.persistent_ctx.from_peer.id),
|
||||
replay_entry_id: None,
|
||||
metadata_replay_entry_id: None,
|
||||
}
|
||||
.with_replay_entry_id(checkpoint.map(|c| c.entry_id))
|
||||
.with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)),
|
||||
);
|
||||
|
||||
Ok(upgrade_instruction)
|
||||
}
|
||||
|
||||
/// Tries to upgrade a candidate region.
|
||||
@@ -144,16 +162,19 @@ impl UpgradeCandidateRegion {
|
||||
/// - [UnexpectedInstructionReply](error::Error::UnexpectedInstructionReply) (impossible).
|
||||
/// - [ExceededDeadline](error::Error::ExceededDeadline)
|
||||
/// - Invalid JSON (impossible).
|
||||
async fn upgrade_region(&self, ctx: &Context) -> Result<()> {
|
||||
let pc = &ctx.persistent_ctx;
|
||||
let region_id = pc.region_id;
|
||||
let candidate = &pc.to_peer;
|
||||
async fn upgrade_region(&self, ctx: &mut Context) -> Result<()> {
|
||||
let operation_timeout =
|
||||
ctx.next_operation_timeout()
|
||||
.context(error::ExceededDeadlineSnafu {
|
||||
operation: "Upgrade region",
|
||||
})?;
|
||||
let upgrade_instruction = self.build_upgrade_region_instruction(ctx, operation_timeout);
|
||||
let upgrade_instruction = self
|
||||
.build_upgrade_region_instruction(ctx, operation_timeout)
|
||||
.await?;
|
||||
|
||||
let pc = &ctx.persistent_ctx;
|
||||
let region_id = pc.region_id;
|
||||
let candidate = &pc.to_peer;
|
||||
|
||||
let msg = MailboxMessage::json_message(
|
||||
&format!("Upgrade candidate region: {}", region_id),
|
||||
@@ -283,8 +304,12 @@ impl UpgradeCandidateRegion {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::assert_matches::assert_matches;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use common_meta::key::table_route::TableRouteValue;
|
||||
use common_meta::key::test_utils::new_test_table_info;
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::rpc::router::{Region, RegionRoute};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::*;
|
||||
@@ -308,14 +333,33 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
async fn prepare_table_metadata(ctx: &Context, wal_options: HashMap<u32, String>) {
|
||||
let table_info =
|
||||
new_test_table_info(ctx.persistent_ctx.region_id.table_id(), vec![1]).into();
|
||||
let region_routes = vec![RegionRoute {
|
||||
region: Region::new_test(ctx.persistent_ctx.region_id),
|
||||
leader_peer: Some(ctx.persistent_ctx.from_peer.clone()),
|
||||
follower_peers: vec![ctx.persistent_ctx.to_peer.clone()],
|
||||
..Default::default()
|
||||
}];
|
||||
ctx.table_metadata_manager
|
||||
.create_table_metadata(
|
||||
table_info,
|
||||
TableRouteValue::physical(region_routes),
|
||||
wal_options,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_datanode_is_unreachable() {
|
||||
let state = UpgradeCandidateRegion::default();
|
||||
let persistent_context = new_persistent_context();
|
||||
let env = TestingEnv::new();
|
||||
let ctx = env.context_factory().new_context(persistent_context);
|
||||
|
||||
let err = state.upgrade_region(&ctx).await.unwrap_err();
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let err = state.upgrade_region(&mut ctx).await.unwrap_err();
|
||||
|
||||
assert_matches!(err, Error::PusherNotFound { .. });
|
||||
assert!(!err.is_retryable());
|
||||
@@ -328,7 +372,8 @@ mod tests {
|
||||
let to_peer_id = persistent_context.to_peer.id;
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let ctx = env.context_factory().new_context(persistent_context);
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
@@ -339,7 +384,7 @@ mod tests {
|
||||
|
||||
drop(rx);
|
||||
|
||||
let err = state.upgrade_region(&ctx).await.unwrap_err();
|
||||
let err = state.upgrade_region(&mut ctx).await.unwrap_err();
|
||||
|
||||
assert_matches!(err, Error::PushMessage { .. });
|
||||
assert!(!err.is_retryable());
|
||||
@@ -351,10 +396,11 @@ mod tests {
|
||||
let persistent_context = new_persistent_context();
|
||||
let env = TestingEnv::new();
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
ctx.volatile_ctx.metrics.operations_elapsed =
|
||||
ctx.persistent_ctx.timeout + Duration::from_secs(1);
|
||||
|
||||
let err = state.upgrade_region(&ctx).await.unwrap_err();
|
||||
let err = state.upgrade_region(&mut ctx).await.unwrap_err();
|
||||
|
||||
assert_matches!(err, Error::ExceededDeadline { .. });
|
||||
assert!(!err.is_retryable());
|
||||
@@ -367,7 +413,8 @@ mod tests {
|
||||
let to_peer_id = persistent_context.to_peer.id;
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let ctx = env.context_factory().new_context(persistent_context);
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
@@ -379,7 +426,7 @@ mod tests {
|
||||
|
||||
send_mock_reply(mailbox, rx, |id| Ok(new_close_region_reply(id)));
|
||||
|
||||
let err = state.upgrade_region(&ctx).await.unwrap_err();
|
||||
let err = state.upgrade_region(&mut ctx).await.unwrap_err();
|
||||
assert_matches!(err, Error::UnexpectedInstructionReply { .. });
|
||||
assert!(!err.is_retryable());
|
||||
}
|
||||
@@ -391,7 +438,8 @@ mod tests {
|
||||
let to_peer_id = persistent_context.to_peer.id;
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let ctx = env.context_factory().new_context(persistent_context);
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
@@ -411,7 +459,7 @@ mod tests {
|
||||
))
|
||||
});
|
||||
|
||||
let err = state.upgrade_region(&ctx).await.unwrap_err();
|
||||
let err = state.upgrade_region(&mut ctx).await.unwrap_err();
|
||||
|
||||
assert_matches!(err, Error::RetryLater { .. });
|
||||
assert!(err.is_retryable());
|
||||
@@ -425,7 +473,8 @@ mod tests {
|
||||
let to_peer_id = persistent_context.to_peer.id;
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let ctx = env.context_factory().new_context(persistent_context);
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
@@ -439,7 +488,7 @@ mod tests {
|
||||
Ok(new_upgrade_region_reply(id, true, false, None))
|
||||
});
|
||||
|
||||
let err = state.upgrade_region(&ctx).await.unwrap_err();
|
||||
let err = state.upgrade_region(&mut ctx).await.unwrap_err();
|
||||
|
||||
assert_matches!(err, Error::Unexpected { .. });
|
||||
assert!(!err.is_retryable());
|
||||
@@ -457,7 +506,8 @@ mod tests {
|
||||
let to_peer_id = persistent_context.to_peer.id;
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let ctx = env.context_factory().new_context(persistent_context);
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
@@ -471,7 +521,7 @@ mod tests {
|
||||
Ok(new_upgrade_region_reply(id, false, true, None))
|
||||
});
|
||||
|
||||
let err = state.upgrade_region(&ctx).await.unwrap_err();
|
||||
let err = state.upgrade_region(&mut ctx).await.unwrap_err();
|
||||
|
||||
assert_matches!(err, Error::RetryLater { .. });
|
||||
assert!(err.is_retryable());
|
||||
@@ -491,7 +541,7 @@ mod tests {
|
||||
Ok(new_upgrade_region_reply(id, false, true, None))
|
||||
});
|
||||
|
||||
state.upgrade_region(&ctx).await.unwrap();
|
||||
state.upgrade_region(&mut ctx).await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -503,6 +553,7 @@ mod tests {
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
@@ -563,6 +614,7 @@ mod tests {
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
@@ -621,6 +673,7 @@ mod tests {
|
||||
|
||||
let mut env = TestingEnv::new();
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
prepare_table_metadata(&ctx, HashMap::default()).await;
|
||||
let mailbox_ctx = env.mailbox_context();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
ctx.volatile_ctx.metrics.operations_elapsed =
|
||||
|
||||
@@ -29,7 +29,6 @@ use common_time::util::current_time_millis;
|
||||
use common_wal::config::kafka::common::{
|
||||
DEFAULT_CHECKPOINT_TRIGGER_SIZE, DEFAULT_FLUSH_TRIGGER_SIZE,
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
@@ -223,31 +222,34 @@ impl RegionFlushTrigger {
|
||||
&self,
|
||||
topic: &str,
|
||||
region_ids: &[RegionId],
|
||||
topic_regions: &HashMap<RegionId, TopicRegionValue>,
|
||||
leader_regions: &HashMap<RegionId, LeaderRegion>,
|
||||
) -> Result<()> {
|
||||
if region_ids.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let regions = region_ids
|
||||
.iter()
|
||||
.flat_map(|region_id| match leader_regions.get(region_id) {
|
||||
Some(leader_region) => {
|
||||
let entry_id = leader_region.manifest.replay_entry_id();
|
||||
let metadata_entry_id = leader_region.manifest.metadata_replay_entry_id();
|
||||
|
||||
Some((
|
||||
Some(leader_region) => should_persist_region_checkpoint(
|
||||
leader_region,
|
||||
topic_regions
|
||||
.get(region_id)
|
||||
.cloned()
|
||||
.and_then(|value| value.checkpoint),
|
||||
)
|
||||
.map(|checkpoint| {
|
||||
(
|
||||
TopicRegionKey::new(*region_id, topic),
|
||||
Some(TopicRegionValue::new(Some(ReplayCheckpoint::new(
|
||||
entry_id,
|
||||
metadata_entry_id,
|
||||
)))),
|
||||
))
|
||||
}
|
||||
Some(TopicRegionValue::new(Some(checkpoint))),
|
||||
)
|
||||
}),
|
||||
None => None,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// The`chunks` will panic if chunks_size is zero, so we return early if there are no regions to persist.
|
||||
if regions.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let max_txn_ops = self.table_metadata_manager.kv_backend().max_txn_ops();
|
||||
let batch_size = max_txn_ops.min(regions.len());
|
||||
for batch in regions.chunks(batch_size) {
|
||||
@@ -271,14 +273,14 @@ impl RegionFlushTrigger {
|
||||
latest_entry_id: u64,
|
||||
avg_record_size: usize,
|
||||
) -> Result<()> {
|
||||
let region_ids = self
|
||||
let topic_regions = self
|
||||
.table_metadata_manager
|
||||
.topic_region_manager()
|
||||
.regions(topic)
|
||||
.await
|
||||
.context(error::TableMetadataManagerSnafu)?;
|
||||
|
||||
if region_ids.is_empty() {
|
||||
if topic_regions.is_empty() {
|
||||
debug!("No regions found for topic: {}", topic);
|
||||
return Ok(());
|
||||
}
|
||||
@@ -286,7 +288,7 @@ impl RegionFlushTrigger {
|
||||
// Filters regions need to persist checkpoints.
|
||||
let regions_to_persist = filter_regions_by_replay_size(
|
||||
topic,
|
||||
region_ids
|
||||
topic_regions
|
||||
.iter()
|
||||
.map(|(region_id, value)| (*region_id, value.min_entry_id().unwrap_or_default())),
|
||||
avg_record_size as u64,
|
||||
@@ -295,33 +297,25 @@ impl RegionFlushTrigger {
|
||||
);
|
||||
let region_manifests = self
|
||||
.leader_region_registry
|
||||
.batch_get(region_ids.keys().cloned());
|
||||
.batch_get(topic_regions.keys().cloned());
|
||||
|
||||
if let Err(err) = self
|
||||
.persist_region_checkpoints(topic, ®ions_to_persist, ®ion_manifests)
|
||||
.persist_region_checkpoints(
|
||||
topic,
|
||||
®ions_to_persist,
|
||||
&topic_regions,
|
||||
®ion_manifests,
|
||||
)
|
||||
.await
|
||||
{
|
||||
error!(err; "Failed to persist region checkpoints for topic: {}", topic);
|
||||
}
|
||||
|
||||
let (inactive_regions, active_regions): (Vec<_>, Vec<_>) = region_manifests
|
||||
let regions = region_manifests
|
||||
.into_iter()
|
||||
.partition_map(|(region_id, region)| {
|
||||
if !region.manifest.is_inactive() {
|
||||
itertools::Either::Left((region_id, region.manifest.prunable_entry_id()))
|
||||
} else {
|
||||
itertools::Either::Right((region_id, region.manifest.prunable_entry_id()))
|
||||
}
|
||||
});
|
||||
|
||||
let min_entry_id = inactive_regions
|
||||
.iter()
|
||||
.min_by_key(|(_, entry_id)| *entry_id);
|
||||
let min_entry_id = active_regions
|
||||
.iter()
|
||||
.min_by_key(|(_, entry_id)| *entry_id)
|
||||
.or(min_entry_id);
|
||||
|
||||
.map(|(region_id, region)| (region_id, region.manifest.prunable_entry_id()))
|
||||
.collect::<Vec<_>>();
|
||||
let min_entry_id = regions.iter().min_by_key(|(_, entry_id)| *entry_id);
|
||||
if let Some((_, min_entry_id)) = min_entry_id {
|
||||
let replay_size = (latest_entry_id.saturating_sub(*min_entry_id))
|
||||
.saturating_mul(avg_record_size as u64);
|
||||
@@ -331,45 +325,28 @@ impl RegionFlushTrigger {
|
||||
}
|
||||
|
||||
// Selects regions to flush from the set of active regions.
|
||||
let mut regions_to_flush = filter_regions_by_replay_size(
|
||||
let regions_to_flush = filter_regions_by_replay_size(
|
||||
topic,
|
||||
active_regions.into_iter(),
|
||||
regions.into_iter(),
|
||||
avg_record_size as u64,
|
||||
latest_entry_id,
|
||||
self.flush_trigger_size,
|
||||
);
|
||||
|
||||
let active_regions_num = regions_to_flush.len();
|
||||
// Selects regions to flush from the set of inactive regions.
|
||||
// For inactive regions, we use a lower flush trigger size (half of the normal size)
|
||||
// to encourage more aggressive flushing to update the region's topic latest entry id.
|
||||
let inactive_regions_to_flush = filter_regions_by_replay_size(
|
||||
topic,
|
||||
inactive_regions.into_iter(),
|
||||
avg_record_size as u64,
|
||||
latest_entry_id,
|
||||
self.flush_trigger_size / 2,
|
||||
);
|
||||
let inactive_regions_num = inactive_regions_to_flush.len();
|
||||
regions_to_flush.extend(inactive_regions_to_flush);
|
||||
|
||||
// Sends flush instructions to datanodes.
|
||||
if !regions_to_flush.is_empty() {
|
||||
self.send_flush_instructions(®ions_to_flush).await?;
|
||||
debug!(
|
||||
"Sent {} flush instructions to datanodes for topic: '{}' ({} inactive regions)",
|
||||
"Sent {} flush instructions to datanodes for topic: '{}', regions: {:?}",
|
||||
regions_to_flush.len(),
|
||||
topic,
|
||||
inactive_regions_num,
|
||||
regions_to_flush,
|
||||
);
|
||||
}
|
||||
|
||||
metrics::METRIC_META_TRIGGERED_REGION_FLUSH_TOTAL
|
||||
.with_label_values(&[topic, "active"])
|
||||
.inc_by(active_regions_num as u64);
|
||||
metrics::METRIC_META_TRIGGERED_REGION_FLUSH_TOTAL
|
||||
.with_label_values(&[topic, "inactive"])
|
||||
.inc_by(inactive_regions_num as u64);
|
||||
.with_label_values(&[topic])
|
||||
.inc_by(regions_to_flush.len() as u64);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -408,6 +385,26 @@ impl RegionFlushTrigger {
    }
}

/// Determines whether a region checkpoint should be persisted based on current and persisted state.
fn should_persist_region_checkpoint(
    current: &LeaderRegion,
    persisted: Option<ReplayCheckpoint>,
) -> Option<ReplayCheckpoint> {
    let new_checkpoint = ReplayCheckpoint::new(
        current.manifest.replay_entry_id(),
        current.manifest.metadata_replay_entry_id(),
    );

    let Some(persisted) = persisted else {
        return Some(new_checkpoint);
    };

    if new_checkpoint > persisted {
        return Some(new_checkpoint);
    }
    None
}
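The `new_checkpoint > persisted` comparison above relies on the ordering of `ReplayCheckpoint`. Assuming the type derives `Ord` over an `entry_id` followed by an optional `metadata_entry_id` (an assumption inferred from the test cases below, not a quote of the actual definition), the comparison is lexicographic, which explains why e.g. (80, Some(11)) does not supersede (90, Some(10)). A minimal illustrative model:

// Illustrative model only; the real ReplayCheckpoint lives in store-api and may differ.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Checkpoint {
    entry_id: u64,
    metadata_entry_id: Option<u64>, // None sorts before Some(_)
}

fn main() {
    let a = Checkpoint { entry_id: 100, metadata_entry_id: Some(10) };
    let b = Checkpoint { entry_id: 100, metadata_entry_id: Some(8) };
    let c = Checkpoint { entry_id: 100, metadata_entry_id: None };
    let d = Checkpoint { entry_id: 90, metadata_entry_id: Some(10) };

    assert!(a > b); // larger metadata entry id wins when entry ids tie
    assert!(a > c); // Some(_) is greater than None when entry ids tie
    assert!(a > d); // entry_id is compared first, so 100 beats 90
    assert!(!(Checkpoint { entry_id: 80, metadata_entry_id: Some(11) } > d));
}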
|
||||
/// Filter regions based on the estimated replay size.
|
||||
///
|
||||
/// Returns the regions if its estimated replay size exceeds the given threshold.
|
||||
@@ -496,6 +493,7 @@ fn is_recent(timestamp: i64, now: i64, duration: Duration) -> bool {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_meta::region_registry::LeaderRegionManifestInfo;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::*;
|
||||
@@ -626,4 +624,92 @@ mod tests {
|
||||
// Only regions 1,1 and 1,2 should be flushed
|
||||
assert_eq!(result, vec![region_id(1, 1), region_id(1, 2)]);
|
||||
}
|
||||
|
||||
fn metric_leader_region(replay_entry_id: u64, metadata_replay_entry_id: u64) -> LeaderRegion {
|
||||
LeaderRegion {
|
||||
datanode_id: 1,
|
||||
manifest: LeaderRegionManifestInfo::Metric {
|
||||
data_manifest_version: 1,
|
||||
data_flushed_entry_id: replay_entry_id,
|
||||
data_topic_latest_entry_id: 0,
|
||||
metadata_manifest_version: 1,
|
||||
metadata_flushed_entry_id: metadata_replay_entry_id,
|
||||
metadata_topic_latest_entry_id: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn mito_leader_region(replay_entry_id: u64) -> LeaderRegion {
|
||||
LeaderRegion {
|
||||
datanode_id: 1,
|
||||
manifest: LeaderRegionManifestInfo::Mito {
|
||||
manifest_version: 1,
|
||||
flushed_entry_id: replay_entry_id,
|
||||
topic_latest_entry_id: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_should_persist_region_checkpoint() {
|
||||
// `persisted` is none
|
||||
let current = metric_leader_region(100, 10);
|
||||
let result = should_persist_region_checkpoint(¤t, None).unwrap();
|
||||
assert_eq!(result, ReplayCheckpoint::new(100, Some(10)));
|
||||
|
||||
// `persisted.entry_id` is less than `current.manifest.replay_entry_id()`
|
||||
let current = mito_leader_region(100);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(90, None)))
|
||||
.unwrap();
|
||||
assert_eq!(result, ReplayCheckpoint::new(100, None));
|
||||
|
||||
let current = metric_leader_region(100, 10);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(90, Some(10))))
|
||||
.unwrap();
|
||||
assert_eq!(result, ReplayCheckpoint::new(100, Some(10)));
|
||||
|
||||
// `persisted.metadata_entry_id` is less than `current.manifest.metadata_replay_entry_id()`
|
||||
let current = metric_leader_region(100, 10);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(100, Some(8))))
|
||||
.unwrap();
|
||||
assert_eq!(result, ReplayCheckpoint::new(100, Some(10)));
|
||||
|
||||
// `persisted.metadata_entry_id` is none
|
||||
let current = metric_leader_region(100, 10);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(100, None)))
|
||||
.unwrap();
|
||||
assert_eq!(result, ReplayCheckpoint::new(100, Some(10)));
|
||||
|
||||
// `current.manifest.metadata_replay_entry_id()` is none
|
||||
let current = mito_leader_region(100);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(100, Some(8))))
|
||||
.is_none();
|
||||
assert!(result);
|
||||
|
||||
// `persisted.entry_id` is equal to `current.manifest.replay_entry_id()`
|
||||
let current = metric_leader_region(100, 10);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(100, Some(10))));
|
||||
assert!(result.is_none());
|
||||
let current = mito_leader_region(100);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(100, None)));
|
||||
assert!(result.is_none());
|
||||
|
||||
// `persisted.entry_id` is less than `current.manifest.replay_entry_id()`
|
||||
// `persisted.metadata_entry_id` is greater than `current.manifest.metadata_replay_entry_id()`
|
||||
let current = metric_leader_region(80, 11);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(90, Some(10))));
|
||||
assert!(result.is_none());
|
||||
let current = mito_leader_region(80);
|
||||
let result =
|
||||
should_persist_region_checkpoint(¤t, Some(ReplayCheckpoint::new(90, Some(10))));
|
||||
assert!(result.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,7 +97,7 @@ impl store_server::Store for Metasrv {
        let req = req.into_inner();

        let _timer = METRIC_META_KV_REQUEST_ELAPSED
            .with_label_values(&[self.kv_backend().name(), "batch_pub"])
            .with_label_values(&[self.kv_backend().name(), "batch_put"])
            .start_timer();

        let req: BatchPutRequest = req.into();

||||
@@ -15,7 +15,9 @@
|
||||
use common_telemetry::debug;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::region_engine::RegionEngine;
|
||||
use store_api::region_request::{AffectedRows, RegionCatchupRequest, RegionRequest};
|
||||
use store_api::region_request::{
|
||||
AffectedRows, RegionCatchupRequest, RegionRequest, ReplayCheckpoint,
|
||||
};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::engine::MetricEngineInner;
|
||||
@@ -59,6 +61,10 @@ impl MetricEngineInner {
|
||||
entry_id: req.metadata_entry_id,
|
||||
metadata_entry_id: None,
|
||||
location_id: req.location_id,
|
||||
checkpoint: req.checkpoint.map(|c| ReplayCheckpoint {
|
||||
entry_id: c.metadata_entry_id.unwrap_or_default(),
|
||||
metadata_entry_id: None,
|
||||
}),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -73,6 +79,10 @@ impl MetricEngineInner {
|
||||
entry_id: req.entry_id,
|
||||
metadata_entry_id: None,
|
||||
location_id: req.location_id,
|
||||
checkpoint: req.checkpoint.map(|c| ReplayCheckpoint {
|
||||
entry_id: c.entry_id,
|
||||
metadata_entry_id: None,
|
||||
}),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -127,10 +127,10 @@ mod tests {
|
||||
r#"
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3157, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20) }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3157, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10) }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3201, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8) }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3185, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4) }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3429, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8) }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3413, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4) }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3157, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10) }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3185, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4) }"#
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3413, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4) }"#
|
||||
);
|
||||
|
||||
// list from storage
|
||||
|
||||
@@ -19,7 +19,7 @@ common-recordbatch.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datatypes.workspace = true
memcomparable = "0.2"
memcomparable = { git = "https://github.com/v0y4g3r/memcomparable.git", rev = "a07122dc03556bbd88ad66234cbea7efd3b23efb" }
paste.workspace = true
serde.workspace = true
snafu.workspace = true

@@ -50,7 +50,6 @@ index.workspace = true
itertools.workspace = true
lazy_static = "1.4"
log-store = { workspace = true }
memcomparable = "0.2"
mito-codec.workspace = true
moka = { workspace = true, features = ["sync", "future"] }
object-store.workspace = true

||||
@@ -189,6 +189,11 @@ impl AccessLayer {
|
||||
&self.puffin_manager_factory
|
||||
}
|
||||
|
||||
/// Returns the intermediate manager.
|
||||
pub fn intermediate_manager(&self) -> &IntermediateManager {
|
||||
&self.intermediate_manager
|
||||
}
|
||||
|
||||
/// Deletes a SST file (and its index file if it has one) with given file id.
|
||||
pub(crate) async fn delete_sst(&self, file_meta: &FileMeta) -> Result<()> {
|
||||
let path = location::sst_file_path(&self.table_dir, file_meta.file_id(), self.path_type);
|
||||
|
||||
@@ -1077,6 +1077,7 @@ mod tests {
|
||||
let staging_manifest_ctx = {
|
||||
let manager = RegionManifestManager::new(
|
||||
version_control.current().version.metadata.clone(),
|
||||
0,
|
||||
RegionManifestOptions {
|
||||
manifest_dir: "".to_string(),
|
||||
object_store: env.access_layer.object_store().clone(),
|
||||
|
||||
@@ -175,6 +175,10 @@ impl FileGroup {
|
||||
pub(crate) fn into_files(self) -> impl Iterator<Item = FileHandle> {
|
||||
self.files.into_iter()
|
||||
}
|
||||
|
||||
pub(crate) fn is_all_level_0(&self) -> bool {
|
||||
self.files.iter().all(|f| f.level() == 0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Ranged for FileGroup {
|
||||
|
||||
@@ -42,6 +42,25 @@ pub fn new_file_handle_with_sequence(
|
||||
end_ts_millis: i64,
|
||||
level: Level,
|
||||
sequence: u64,
|
||||
) -> FileHandle {
|
||||
new_file_handle_with_size_and_sequence(
|
||||
file_id,
|
||||
start_ts_millis,
|
||||
end_ts_millis,
|
||||
level,
|
||||
sequence,
|
||||
0,
|
||||
)
|
||||
}
|
||||
|
||||
/// Test util to create file handles with custom size.
|
||||
pub fn new_file_handle_with_size_and_sequence(
|
||||
file_id: FileId,
|
||||
start_ts_millis: i64,
|
||||
end_ts_millis: i64,
|
||||
level: Level,
|
||||
sequence: u64,
|
||||
file_size: u64,
|
||||
) -> FileHandle {
|
||||
let file_purger = new_noop_file_purger();
|
||||
FileHandle::new(
|
||||
@@ -53,7 +72,7 @@ pub fn new_file_handle_with_sequence(
|
||||
Timestamp::new_millisecond(end_ts_millis),
|
||||
),
|
||||
level,
|
||||
file_size: 0,
|
||||
file_size,
|
||||
available_indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
|
||||
@@ -64,11 +64,32 @@ impl TwcsPicker {
|
||||
continue;
|
||||
}
|
||||
let mut files_to_merge: Vec<_> = files.files().cloned().collect();
|
||||
|
||||
// Filter out large files in append mode - they won't benefit from compaction
|
||||
if self.append_mode {
|
||||
if let Some(max_size) = self.max_output_file_size {
|
||||
let (kept_files, ignored_files) = files_to_merge
|
||||
.into_iter()
|
||||
.partition(|fg| fg.size() <= max_size as usize && fg.is_all_level_0());
|
||||
files_to_merge = kept_files;
|
||||
info!(
|
||||
"Skipped {} large files in append mode for region {}, window {}, max_size: {}",
|
||||
ignored_files.len(),
|
||||
region_id,
|
||||
window,
|
||||
max_size
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let sorted_runs = find_sorted_runs(&mut files_to_merge);
|
||||
let found_runs = sorted_runs.len();
|
||||
// We only remove deletion markers if we found less than 2 runs and not in append mode.
|
||||
// because after compaction there will be no overlapping files.
|
||||
let filter_deleted = !files.overlapping && found_runs <= 2 && !self.append_mode;
|
||||
if found_runs == 0 {
|
||||
return output;
|
||||
}
|
||||
|
||||
let inputs = if found_runs > 1 {
|
||||
reduce_runs(sorted_runs)
|
||||
@@ -330,7 +351,9 @@ mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::*;
|
||||
use crate::compaction::test_util::{new_file_handle, new_file_handle_with_sequence};
|
||||
use crate::compaction::test_util::{
|
||||
new_file_handle, new_file_handle_with_sequence, new_file_handle_with_size_and_sequence,
|
||||
};
|
||||
use crate::sst::file::{FileId, Level};
|
||||
|
||||
#[test]
|
||||
@@ -766,5 +789,45 @@ mod tests {
|
||||
.check();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_append_mode_filter_large_files() {
|
||||
let file_ids = (0..4).map(|_| FileId::random()).collect::<Vec<_>>();
|
||||
let max_output_file_size = 1000u64;
|
||||
|
||||
// Create files with different sizes
|
||||
let small_file_1 = new_file_handle_with_size_and_sequence(file_ids[0], 0, 999, 0, 1, 500);
|
||||
let large_file_1 = new_file_handle_with_size_and_sequence(file_ids[1], 0, 999, 0, 2, 1500);
|
||||
let small_file_2 = new_file_handle_with_size_and_sequence(file_ids[2], 0, 999, 0, 3, 800);
|
||||
let large_file_2 = new_file_handle_with_size_and_sequence(file_ids[3], 0, 999, 0, 4, 2000);
|
||||
|
||||
// Create file groups (each file is in its own group due to different sequences)
|
||||
let mut files_to_merge = vec![
|
||||
FileGroup::new_with_file(small_file_1),
|
||||
FileGroup::new_with_file(large_file_1),
|
||||
FileGroup::new_with_file(small_file_2),
|
||||
FileGroup::new_with_file(large_file_2),
|
||||
];
|
||||
|
||||
// Test filtering logic directly
|
||||
let original_count = files_to_merge.len();
|
||||
|
||||
// Apply append mode filtering
|
||||
files_to_merge.retain(|fg| fg.size() <= max_output_file_size as usize);
|
||||
|
||||
// Should have filtered out 2 large files, leaving 2 small files
|
||||
assert_eq!(files_to_merge.len(), 2);
|
||||
assert_eq!(original_count, 4);
|
||||
|
||||
// Verify the remaining files are the small ones
|
||||
for fg in &files_to_merge {
|
||||
assert!(
|
||||
fg.size() <= max_output_file_size as usize,
|
||||
"File size {} should be <= {}",
|
||||
fg.size(),
|
||||
max_output_file_size
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(hl): TTL tester that checks if get_expired_ssts function works as expected.
|
||||
}
|
||||
|
||||
@@ -53,6 +53,8 @@ mod prune_test;
|
||||
#[cfg(test)]
|
||||
mod row_selector_test;
|
||||
#[cfg(test)]
|
||||
mod scan_corrupt;
|
||||
#[cfg(test)]
|
||||
mod scan_test;
|
||||
#[cfg(test)]
|
||||
mod set_role_state_test;
|
||||
|
||||
@@ -127,8 +127,7 @@ async fn test_catchup_with_last_entry_id(factory: Option<LogStoreFactory>) {
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: false,
|
||||
entry_id: last_entry_id,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -160,8 +159,7 @@ async fn test_catchup_with_last_entry_id(factory: Option<LogStoreFactory>) {
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: true,
|
||||
entry_id: last_entry_id,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -251,8 +249,7 @@ async fn test_catchup_with_incorrect_last_entry_id(factory: Option<LogStoreFacto
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: false,
|
||||
entry_id: incorrect_last_entry_id,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -269,8 +266,7 @@ async fn test_catchup_with_incorrect_last_entry_id(factory: Option<LogStoreFacto
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: false,
|
||||
entry_id: incorrect_last_entry_id,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -340,9 +336,7 @@ async fn test_catchup_without_last_entry_id(factory: Option<LogStoreFactory>) {
|
||||
region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: false,
|
||||
entry_id: None,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -372,9 +366,7 @@ async fn test_catchup_without_last_entry_id(factory: Option<LogStoreFactory>) {
|
||||
region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: true,
|
||||
entry_id: None,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -465,9 +457,7 @@ async fn test_catchup_with_manifest_update(factory: Option<LogStoreFactory>) {
|
||||
region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: false,
|
||||
entry_id: None,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -503,9 +493,7 @@ async fn test_catchup_with_manifest_update(factory: Option<LogStoreFactory>) {
|
||||
region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: true,
|
||||
entry_id: None,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -652,9 +640,7 @@ async fn test_local_catchup(factory: Option<LogStoreFactory>) {
|
||||
region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: true,
|
||||
entry_id: None,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
@@ -715,9 +701,7 @@ async fn test_catchup_not_exist() {
|
||||
non_exist_region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: true,
|
||||
entry_id: None,
|
||||
metadata_entry_id: None,
|
||||
location_id: None,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
112
src/mito2/src/engine/scan_corrupt.rs
Normal file
112
src/mito2/src/engine/scan_corrupt.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use api::v1::helper::row;
|
||||
use api::v1::value::ValueData;
|
||||
use api::v1::Rows;
|
||||
use datatypes::value::Value;
|
||||
use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodec};
|
||||
use parquet::file::statistics::Statistics;
|
||||
use store_api::region_engine::RegionEngine;
|
||||
use store_api::region_request::{PathType, RegionRequest};
|
||||
use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::config::MitoConfig;
|
||||
use crate::sst::parquet::reader::ParquetReaderBuilder;
|
||||
use crate::test_util;
|
||||
use crate::test_util::{CreateRequestBuilder, TestEnv};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_scan_corrupt() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let mut env = TestEnv::with_prefix("test_write_stats_with_long_string_value").await;
|
||||
let engine = env.create_engine(MitoConfig::default()).await;
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let request = CreateRequestBuilder::new().build();
|
||||
let table_dir = request.table_dir.clone();
|
||||
let column_schemas = test_util::rows_schema(&request);
|
||||
|
||||
engine
|
||||
.handle_request(region_id, RegionRequest::Create(request))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let build_rows = |start: i32, end: i32| {
|
||||
(start..end)
|
||||
.map(|i| {
|
||||
row(vec![
|
||||
ValueData::StringValue(i.to_string().repeat(128)),
|
||||
ValueData::F64Value(i as f64),
|
||||
ValueData::TimestampMillisecondValue(i as i64 * 1000),
|
||||
])
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
let put_rows = async |start, end| {
|
||||
let rows = Rows {
|
||||
schema: column_schemas.clone(),
|
||||
rows: build_rows(start, end),
|
||||
};
|
||||
test_util::put_rows(&engine, region_id, rows).await;
|
||||
test_util::flush_region(&engine, region_id, None).await;
|
||||
};
|
||||
put_rows(0, 3).await;
|
||||
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
|
||||
let version = region.version();
|
||||
let file = version
|
||||
.ssts
|
||||
.levels()
|
||||
.iter()
|
||||
.flat_map(|l| l.files.values())
|
||||
.next()
|
||||
.unwrap();
|
||||
|
||||
let object_store = env.get_object_store().unwrap();
|
||||
let reader = ParquetReaderBuilder::new(
|
||||
table_dir.clone(),
|
||||
PathType::Bare,
|
||||
file.clone(),
|
||||
object_store.clone(),
|
||||
)
|
||||
.build()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let codec = DensePrimaryKeyCodec::new(&version.metadata);
|
||||
for r in reader.parquet_metadata().row_groups() {
|
||||
for c in r.columns() {
|
||||
if c.column_descr().name() == PRIMARY_KEY_COLUMN_NAME {
|
||||
let stats = c.statistics().unwrap();
|
||||
let Statistics::ByteArray(b) = stats else {
|
||||
unreachable!()
|
||||
};
|
||||
let min = codec
|
||||
.decode_leftmost(b.min_bytes_opt().unwrap())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(Value::String("0".repeat(128).into()), min);
|
||||
|
||||
let max = codec
|
||||
.decode_leftmost(b.max_bytes_opt().unwrap())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(Value::String("2".repeat(128).into()), max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -27,8 +27,8 @@ use crate::error::{
|
||||
self, InstallManifestToSnafu, NoCheckpointSnafu, NoManifestsSnafu, RegionStoppedSnafu, Result,
|
||||
};
|
||||
use crate::manifest::action::{
|
||||
RegionChange, RegionCheckpoint, RegionManifest, RegionManifestBuilder, RegionMetaAction,
|
||||
RegionMetaActionList,
|
||||
RegionChange, RegionCheckpoint, RegionEdit, RegionManifest, RegionManifestBuilder,
|
||||
RegionMetaAction, RegionMetaActionList,
|
||||
};
|
||||
use crate::manifest::checkpointer::Checkpointer;
|
||||
use crate::manifest::storage::{
|
||||
@@ -150,6 +150,7 @@ impl RegionManifestManager {
|
||||
/// Constructs a region's manifest and persist it.
|
||||
pub async fn new(
|
||||
metadata: RegionMetadataRef,
|
||||
flushed_entry_id: u64,
|
||||
options: RegionManifestOptions,
|
||||
total_manifest_size: Arc<AtomicU64>,
|
||||
manifest_version: Arc<AtomicU64>,
|
||||
@@ -163,8 +164,8 @@ impl RegionManifestManager {
|
||||
);
|
||||
|
||||
info!(
|
||||
"Creating region manifest in {} with metadata {:?}",
|
||||
options.manifest_dir, metadata
|
||||
"Creating region manifest in {} with metadata {:?}, flushed_entry_id: {}",
|
||||
options.manifest_dir, metadata, flushed_entry_id
|
||||
);
|
||||
|
||||
let version = MIN_VERSION;
|
||||
@@ -184,9 +185,21 @@ impl RegionManifestManager {
|
||||
options.manifest_dir, manifest
|
||||
);
|
||||
|
||||
let mut actions = vec![RegionMetaAction::Change(RegionChange { metadata })];
|
||||
if flushed_entry_id > 0 {
|
||||
actions.push(RegionMetaAction::Edit(RegionEdit {
|
||||
files_to_add: vec![],
|
||||
files_to_remove: vec![],
|
||||
timestamp_ms: None,
|
||||
compaction_time_window: None,
|
||||
flushed_entry_id: Some(flushed_entry_id),
|
||||
flushed_sequence: None,
|
||||
}));
|
||||
}
|
||||
|
||||
// Persist region change.
|
||||
let action_list =
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { metadata }));
|
||||
let action_list = RegionMetaActionList::new(actions);
|
||||
|
||||
// New region is not in staging mode.
|
||||
// TODO(ruihang): add staging mode support if needed.
|
||||
store.save(version, &action_list.encode()?, false).await?;
|
||||
|
||||
@@ -554,6 +554,8 @@ impl BulkPartEncoder {
|
||||
WriterProperties::builder()
|
||||
.set_write_batch_size(row_group_size)
|
||||
.set_max_row_group_size(row_group_size)
|
||||
.set_column_index_truncate_length(None)
|
||||
.set_statistics_truncate_length(None)
|
||||
.build(),
|
||||
);
|
||||
Self {
|
||||
|
||||
@@ -774,7 +774,9 @@ impl<'a> DataPartEncoder<'a> {
|
||||
.set_column_encoding(sequence_col.clone(), Encoding::DELTA_BINARY_PACKED)
|
||||
.set_column_dictionary_enabled(sequence_col, false)
|
||||
.set_column_encoding(op_type_col.clone(), Encoding::DELTA_BINARY_PACKED)
|
||||
.set_column_dictionary_enabled(op_type_col, true);
|
||||
.set_column_dictionary_enabled(op_type_col, true)
|
||||
.set_column_index_truncate_length(None)
|
||||
.set_statistics_truncate_length(None);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream};
|
||||
use crate::read::{Batch, ScannerMetrics};
|
||||
|
||||
/// Timeout to send a batch to a sender.
|
||||
const SEND_TIMEOUT: Duration = Duration::from_millis(10);
|
||||
const SEND_TIMEOUT: Duration = Duration::from_micros(100);
|
||||
|
||||
/// List of receivers.
|
||||
type ReceiverList = Vec<Option<Receiver<Result<SeriesBatch>>>>;
|
||||
|
||||
@@ -1122,6 +1122,7 @@ mod tests {
|
||||
let staging_ctx = {
|
||||
let manager = RegionManifestManager::new(
|
||||
version_control.current().version.metadata.clone(),
|
||||
0,
|
||||
RegionManifestOptions {
|
||||
manifest_dir: "".to_string(),
|
||||
object_store: env.access_layer.object_store().clone(),
|
||||
@@ -1187,6 +1188,7 @@ mod tests {
|
||||
|
||||
let manager = RegionManifestManager::new(
|
||||
metadata.clone(),
|
||||
0,
|
||||
RegionManifestOptions {
|
||||
manifest_dir: "".to_string(),
|
||||
object_store: access_layer.object_store().clone(),
|
||||
|
||||
@@ -238,8 +238,11 @@ impl RegionOpener {
|
||||
// Create a manifest manager for this region and writes regions to the manifest file.
|
||||
let region_manifest_options =
|
||||
Self::manifest_options(config, &options, ®ion_dir, &self.object_store_manager)?;
|
||||
// For remote WAL, we need to set flushed_entry_id to current topic's latest entry id.
|
||||
let flushed_entry_id = provider.initial_flushed_entry_id::<S>(wal.store());
|
||||
let manifest_manager = RegionManifestManager::new(
|
||||
metadata.clone(),
|
||||
flushed_entry_id,
|
||||
region_manifest_options,
|
||||
self.stats.total_manifest_size.clone(),
|
||||
self.stats.manifest_version.clone(),
|
||||
@@ -439,7 +442,7 @@ impl RegionOpener {
|
||||
.build();
|
||||
let flushed_entry_id = version.flushed_entry_id;
|
||||
let version_control = Arc::new(VersionControl::new(version));
|
||||
if !self.skip_wal_replay {
|
||||
let topic_latest_entry_id = if !self.skip_wal_replay {
|
||||
let replay_from_entry_id = self
|
||||
.replay_checkpoint
|
||||
.unwrap_or_default()
|
||||
@@ -461,14 +464,26 @@ impl RegionOpener {
|
||||
on_region_opened,
|
||||
)
|
||||
.await?;
|
||||
// For remote WAL, we need to set topic_latest_entry_id to current topic's latest entry id.
|
||||
// Only set after the WAL replay is completed.
|
||||
let topic_latest_entry_id = if provider.is_remote_wal()
|
||||
&& version_control.current().version.memtables.is_empty()
|
||||
{
|
||||
wal.store().latest_entry_id(&provider).unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
topic_latest_entry_id
|
||||
} else {
|
||||
info!(
|
||||
"Skip the WAL replay for region: {}, manifest version: {}, flushed_entry_id: {}",
|
||||
region_id, manifest.manifest_version, flushed_entry_id
|
||||
);
|
||||
}
|
||||
let now = self.time_provider.current_time_millis();
|
||||
|
||||
0
|
||||
};
|
||||
let now = self.time_provider.current_time_millis();
|
||||
let region = MitoRegion {
|
||||
region_id: self.region_id,
|
||||
version_control,
|
||||
@@ -483,7 +498,7 @@ impl RegionOpener {
|
||||
last_flush_millis: AtomicI64::new(now),
|
||||
last_compaction_millis: AtomicI64::new(now),
|
||||
time_provider: self.time_provider.clone(),
|
||||
topic_latest_entry_id: AtomicU64::new(0),
|
||||
topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id),
|
||||
write_bytes: Arc::new(AtomicU64::new(0)),
|
||||
memtable_builder,
|
||||
stats: self.stats.clone(),
|
||||
@@ -713,8 +728,8 @@ where
|
||||
|
||||
let series_count = version_control.current().series_count();
|
||||
info!(
|
||||
"Replay WAL for region: {}, rows recovered: {}, last entry id: {}, total timeseries replayed: {}, elapsed: {:?}",
|
||||
region_id, rows_replayed, last_entry_id, series_count, now.elapsed()
|
||||
"Replay WAL for region: {}, provider: {:?}, rows recovered: {}, replay from entry id: {}, last entry id: {}, total timeseries replayed: {}, elapsed: {:?}",
|
||||
region_id, provider, rows_replayed, replay_from_entry_id, last_entry_id, series_count, now.elapsed()
|
||||
);
|
||||
Ok(last_entry_id)
|
||||
}
|
||||
|
||||
@@ -371,7 +371,7 @@ impl VersionBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets truncated entty id.
|
||||
/// Sets truncated entry id.
|
||||
pub(crate) fn truncated_entry_id(mut self, entry_id: Option<EntryId>) -> Self {
|
||||
self.truncated_entry_id = entry_id;
|
||||
self
|
||||
|
||||
@@ -319,6 +319,10 @@ impl FileHandle {
|
||||
pub fn num_rows(&self) -> usize {
|
||||
self.inner.meta.num_rows as usize
|
||||
}
|
||||
|
||||
pub fn level(&self) -> Level {
|
||||
self.inner.meta.level
|
||||
}
|
||||
}
|
||||
|
||||
/// Inner data of [FileHandle].
|
||||
|
||||
@@ -137,6 +137,14 @@ impl FilePurger for LocalFilePurger {
|
||||
error!(e; "Failed to purge stager with index file, file_id: {}, region: {}",
|
||||
file_meta.file_id(), file_meta.region_id);
|
||||
}
|
||||
let file_id = file_meta.file_id();
|
||||
if let Err(e) = sst_layer
|
||||
.intermediate_manager()
|
||||
.prune_sst_dir(&file_id.region_id(), &file_id.file_id())
|
||||
.await
|
||||
{
|
||||
error!(e; "Failed to prune intermediate sst directory, region_id: {}, file_id: {}", file_id.region_id(), file_id.file_id());
|
||||
}
|
||||
})) {
|
||||
error!(e; "Failed to schedule the file purge request");
|
||||
}
|
||||
|
||||
@@ -110,6 +110,7 @@ pub struct Indexer {
|
||||
last_mem_fulltext_index: usize,
|
||||
bloom_filter_indexer: Option<BloomFilterIndexer>,
|
||||
last_mem_bloom_filter: usize,
|
||||
intermediate_manager: Option<IntermediateManager>,
|
||||
}
|
||||
|
||||
impl Indexer {
|
||||
@@ -196,6 +197,7 @@ impl IndexerBuilder for IndexerBuilderImpl {
|
||||
indexer.inverted_indexer = self.build_inverted_indexer(file_id);
|
||||
indexer.fulltext_indexer = self.build_fulltext_indexer(file_id).await;
|
||||
indexer.bloom_filter_indexer = self.build_bloom_filter_indexer(file_id);
|
||||
indexer.intermediate_manager = Some(self.intermediate_manager.clone());
|
||||
if indexer.inverted_indexer.is_none()
|
||||
&& indexer.fulltext_indexer.is_none()
|
||||
&& indexer.bloom_filter_indexer.is_none()
|
||||
|
||||
@@ -21,6 +21,7 @@ impl Indexer {
|
||||
self.do_abort_inverted_index().await;
|
||||
self.do_abort_fulltext_index().await;
|
||||
self.do_abort_bloom_filter().await;
|
||||
self.do_prune_intm_sst_dir().await;
|
||||
self.puffin_manager = None;
|
||||
}
|
||||
|
||||
|
||||
@@ -54,6 +54,7 @@ impl Indexer {
|
||||
return IndexOutput::default();
|
||||
}
|
||||
|
||||
self.do_prune_intm_sst_dir().await;
|
||||
output.file_size = self.do_finish_puffin_writer(writer).await;
|
||||
output
|
||||
}
|
||||
@@ -270,4 +271,12 @@ impl Indexer {
|
||||
output.row_count = row_count;
|
||||
output.columns = column_ids;
|
||||
}
|
||||
|
||||
pub(crate) async fn do_prune_intm_sst_dir(&mut self) {
|
||||
if let Some(manager) = self.intermediate_manager.take() {
|
||||
if let Err(e) = manager.prune_sst_dir(&self.region_id, &self.file_id).await {
|
||||
warn!(e; "Failed to prune intermediate SST directory, region_id: {}, file_id: {}", self.region_id, self.file_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::io::ErrorKind;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use async_trait::async_trait;
|
||||
@@ -54,14 +55,28 @@ impl IntermediateManager {
|
||||
aux_path.as_ref()
|
||||
);
|
||||
|
||||
// Remove the intermediate directory on bankground
|
||||
let aux_pb = PathBuf::from(aux_path.as_ref());
|
||||
let intm_dir = aux_pb.join(INTERMEDIATE_DIR);
|
||||
let deleted_dir = intm_dir.with_extension(format!("deleted-{}", Uuid::new_v4()));
|
||||
match tokio::fs::rename(&intm_dir, &deleted_dir).await {
|
||||
Ok(_) => {
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = tokio::fs::remove_dir_all(deleted_dir).await {
|
||||
warn!(err; "Failed to remove intermediate directory");
|
||||
}
|
||||
});
|
||||
}
|
||||
Err(err) => {
|
||||
if err.kind() != ErrorKind::NotFound {
|
||||
warn!(err; "Failed to rename intermediate directory");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let store = new_fs_cache_store(&normalize_dir(aux_path.as_ref())).await?;
|
||||
let store = InstrumentedStore::new(store);
|
||||
|
||||
// Remove all garbage intermediate files from previous runs.
|
||||
if let Err(err) = store.remove_all(INTERMEDIATE_DIR).await {
|
||||
warn!(err; "Failed to remove garbage intermediate files");
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
base_dir: PathBuf::from(aux_path.as_ref()),
|
||||
store,
|
||||
@@ -94,6 +109,24 @@ impl IntermediateManager {
|
||||
.join(sst_file_id.to_string())
|
||||
.join(format!("fulltext-{column_id}-{uuid}"))
|
||||
}
|
||||
|
||||
/// Prunes the intermediate directory for SST files.
|
||||
pub(crate) async fn prune_sst_dir(
|
||||
&self,
|
||||
region_id: &RegionId,
|
||||
sst_file_id: &FileId,
|
||||
) -> Result<()> {
|
||||
let region_id = region_id.as_u64();
|
||||
let sst_dir = format!("{INTERMEDIATE_DIR}/{region_id}/{sst_file_id}/");
|
||||
self.store.remove_all(&sst_dir).await
|
||||
}
|
||||
|
||||
/// Prunes the intermediate directory for region files.
|
||||
pub(crate) async fn prune_region_dir(&self, region_id: &RegionId) -> Result<()> {
|
||||
let region_id = region_id.as_u64();
|
||||
let region_dir = format!("{INTERMEDIATE_DIR}/{region_id}/");
|
||||
self.store.remove_all(®ion_dir).await
|
||||
}
|
||||
}
|
||||
|
||||
/// `IntermediateLocation` produces paths for intermediate files
|
||||
@@ -268,6 +301,60 @@ mod tests {
|
||||
.unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cleanup_dir() {
|
||||
let temp_dir = temp_dir::create_temp_dir("test_cleanup_dir_");
|
||||
|
||||
let region_id = RegionId::new(0, 0);
|
||||
let sst_file_id = FileId::random();
|
||||
let region_dir = temp_dir
|
||||
.path()
|
||||
.join(INTERMEDIATE_DIR)
|
||||
.join(region_id.as_u64().to_string());
|
||||
let sst_dir = region_dir.join(sst_file_id.to_string());
|
||||
|
||||
let path = temp_dir.path().to_str().unwrap();
|
||||
let manager = IntermediateManager::init_fs(path).await.unwrap();
|
||||
|
||||
let location = IntermediateLocation::new(®ion_id, &sst_file_id);
|
||||
let temp_file_provider = TempFileProvider::new(location, manager.clone());
|
||||
|
||||
let mut f1 = temp_file_provider
|
||||
.create("sky", "000000000000")
|
||||
.await
|
||||
.unwrap();
|
||||
f1.write_all(b"hello").await.unwrap();
|
||||
f1.flush().await.unwrap();
|
||||
f1.close().await.unwrap();
|
||||
|
||||
let mut f2 = temp_file_provider
|
||||
.create("sky", "000000000001")
|
||||
.await
|
||||
.unwrap();
|
||||
f2.write_all(b"world").await.unwrap();
|
||||
f2.flush().await.unwrap();
|
||||
f2.close().await.unwrap();
|
||||
|
||||
temp_file_provider.cleanup().await.unwrap();
|
||||
|
||||
// sst_dir and region_dir still exists
|
||||
assert!(tokio::fs::try_exists(&sst_dir).await.unwrap());
|
||||
assert!(tokio::fs::try_exists(®ion_dir).await.unwrap());
|
||||
|
||||
// sst_dir should be deleted, region_dir still exists
|
||||
manager
|
||||
.prune_sst_dir(®ion_id, &sst_file_id)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(tokio::fs::try_exists(®ion_dir).await.unwrap());
|
||||
assert!(!tokio::fs::try_exists(&sst_dir).await.unwrap());
|
||||
|
||||
// sst_dir, region_dir should be deleted
|
||||
manager.prune_region_dir(®ion_id).await.unwrap();
|
||||
assert!(!tokio::fs::try_exists(&sst_dir).await.unwrap());
|
||||
assert!(!tokio::fs::try_exists(®ion_dir).await.unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intermediate_location() {
|
||||
let sst_file_id = FileId::random();
|
||||
|
||||
@@ -329,7 +329,9 @@ where
|
||||
.set_key_value_metadata(Some(vec![key_value_meta]))
|
||||
.set_compression(Compression::ZSTD(ZstdLevel::default()))
|
||||
.set_encoding(Encoding::PLAIN)
|
||||
.set_max_row_group_size(opts.row_group_size);
|
||||
.set_max_row_group_size(opts.row_group_size)
|
||||
.set_column_index_truncate_length(None)
|
||||
.set_statistics_truncate_length(None);
|
||||
|
||||
let props_builder = Self::customize_column_config(props_builder, &self.metadata);
|
||||
let writer_props = props_builder.build();
|
||||
|
||||
@@ -563,6 +563,7 @@ impl TestEnv {
|
||||
if let Some(metadata) = initial_metadata {
|
||||
RegionManifestManager::new(
|
||||
metadata,
|
||||
0,
|
||||
manifest_opts,
|
||||
Default::default(),
|
||||
Default::default(),
|
||||
|
||||
@@ -116,6 +116,7 @@ impl SchedulerEnv {
|
||||
Arc::new(ManifestContext::new(
|
||||
RegionManifestManager::new(
|
||||
metadata,
|
||||
0,
|
||||
RegionManifestOptions {
|
||||
manifest_dir: "".to_string(),
|
||||
object_store: self.access_layer.object_store().clone(),
|
||||
|
||||
@@ -65,7 +65,12 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
|
||||
if region.provider.is_remote_wal() {
|
||||
let flushed_entry_id = region.version_control.current().last_entry_id;
|
||||
info!("Trying to replay memtable for region: {region_id}, flushed entry id: {flushed_entry_id}");
|
||||
let replay_from_entry_id = request
|
||||
.checkpoint
|
||||
.map(|c| c.entry_id)
|
||||
.unwrap_or_default()
|
||||
.max(flushed_entry_id);
|
||||
info!("Trying to replay memtable for region: {region_id}, provider: {:?}, replay from entry id: {replay_from_entry_id}, flushed entry id: {flushed_entry_id}", region.provider);
|
||||
let timer = Instant::now();
|
||||
let wal_entry_reader =
|
||||
self.wal
|
||||
@@ -75,15 +80,16 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
®ion.provider,
|
||||
wal_entry_reader,
|
||||
region_id,
|
||||
flushed_entry_id,
|
||||
replay_from_entry_id,
|
||||
®ion.version_control,
|
||||
self.config.allow_stale_entries,
|
||||
on_region_opened,
|
||||
)
|
||||
.await?;
|
||||
info!(
|
||||
"Elapsed: {:?}, region: {region_id} catchup finished. last entry id: {last_entry_id}, expected: {:?}.",
|
||||
"Elapsed: {:?}, region: {region_id}, provider: {:?} catchup finished. replay from entry id: {replay_from_entry_id}, flushed entry id: {flushed_entry_id}, last entry id: {last_entry_id}, expected: {:?}.",
|
||||
timer.elapsed(),
|
||||
region.provider,
|
||||
request.entry_id
|
||||
);
|
||||
if let Some(expected_last_entry_id) = request.entry_id {
|
||||
|
||||
@@ -99,6 +99,7 @@ where
|
||||
let object_store = region.access_layer.object_store().clone();
|
||||
let dropping_regions = self.dropping_regions.clone();
|
||||
let listener = self.listener.clone();
|
||||
let intm_manager = self.intermediate_manager.clone();
|
||||
common_runtime::spawn_global(async move {
|
||||
let gc_duration = listener
|
||||
.on_later_drop_begin(region_id)
|
||||
@@ -111,6 +112,9 @@ where
|
||||
gc_duration,
|
||||
)
|
||||
.await;
|
||||
if let Err(err) = intm_manager.prune_region_dir(®ion_id).await {
|
||||
warn!(err; "Failed to prune intermediate region directory, region_id: {}", region_id);
|
||||
}
|
||||
listener.on_later_drop_end(region_id, removed);
|
||||
});
|
||||
|
||||
|
||||
@@ -605,7 +605,13 @@ impl RangeManipulateStream {
|
||||
|
||||
// shorten the range to calculate
|
||||
let first_ts = ts_column.value(0);
|
||||
let first_ts_aligned = (first_ts / self.interval) * self.interval;
|
||||
// Preserve the query's alignment pattern when optimizing start time
|
||||
let remainder = (first_ts - self.start).rem_euclid(self.interval);
|
||||
let first_ts_aligned = if remainder == 0 {
|
||||
first_ts
|
||||
} else {
|
||||
first_ts + (self.interval - remainder)
|
||||
};
|
||||
let last_ts = ts_column.value(ts_column.len() - 1);
|
||||
let last_ts_aligned = ((last_ts + self.range) / self.interval) * self.interval;
|
||||
let start = self.start.max(first_ts_aligned);
|
||||
@@ -671,6 +677,7 @@ mod test {
|
||||
use datafusion::datasource::source::DataSourceExec;
|
||||
use datafusion::physical_expr::Partitioning;
|
||||
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
|
||||
use datafusion::physical_plan::memory::MemoryStream;
|
||||
use datafusion::prelude::SessionContext;
|
||||
use datatypes::arrow::array::TimestampMillisecondArray;
|
||||
|
||||
@@ -832,4 +839,66 @@ mod test {
|
||||
}");
|
||||
do_normalize_test(1, 10_001, 3_000, 1_000, expected).await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_calculate_range_preserves_alignment() {
|
||||
// Test case: query starts at timestamp ending in 4000, step is 30s
|
||||
// Data starts at different alignment - should preserve query's 4000 pattern
|
||||
let schema = Arc::new(Schema::new(vec![Field::new(
|
||||
"timestamp",
|
||||
TimestampMillisecondType::DATA_TYPE,
|
||||
false,
|
||||
)]));
|
||||
let empty_stream = MemoryStream::try_new(vec![], schema.clone(), None).unwrap();
|
||||
|
||||
let stream = RangeManipulateStream {
|
||||
start: 1758093274000, // ends in 4000
|
||||
end: 1758093334000, // ends in 4000
|
||||
interval: 30000, // 30s step
|
||||
range: 60000, // 60s lookback
|
||||
time_index: 0,
|
||||
field_columns: vec![],
|
||||
aligned_ts_array: Arc::new(TimestampMillisecondArray::from(vec![0i64; 0])),
|
||||
output_schema: schema.clone(),
|
||||
input: Box::pin(empty_stream),
|
||||
metric: BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0),
|
||||
num_series: Count::new(),
|
||||
};
|
||||
|
||||
// Create test data with timestamps not aligned to query pattern
|
||||
let test_timestamps = vec![
|
||||
1758093260000, // ends in 0000 (different alignment)
|
||||
1758093290000, // ends in 0000
|
||||
1758093320000, // ends in 0000
|
||||
];
|
||||
let ts_array = TimestampMillisecondArray::from(test_timestamps);
|
||||
let test_schema = Arc::new(Schema::new(vec![Field::new(
|
||||
"timestamp",
|
||||
TimestampMillisecondType::DATA_TYPE,
|
||||
false,
|
||||
)]));
|
||||
let batch = RecordBatch::try_new(test_schema, vec![Arc::new(ts_array)]).unwrap();
|
||||
|
||||
let (ranges, (start, end)) = stream.calculate_range(&batch).unwrap();
|
||||
|
||||
// Verify the optimized start preserves query alignment (should end in 4000)
|
||||
assert_eq!(
|
||||
start % 30000,
|
||||
1758093274000 % 30000,
|
||||
"Optimized start should preserve query alignment pattern"
|
||||
);
|
||||
|
||||
// Verify we generate correct number of ranges for the alignment
|
||||
let expected_timestamps: Vec<i64> = (start..=end).step_by(30000).collect();
|
||||
assert_eq!(ranges.len(), expected_timestamps.len());
|
||||
|
||||
// Verify all generated timestamps maintain the same alignment pattern
|
||||
for ts in expected_timestamps {
|
||||
assert_eq!(
|
||||
ts % 30000,
|
||||
1758093274000 % 30000,
|
||||
"All timestamps should maintain query alignment pattern"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -369,6 +369,9 @@ impl<H> BoundedStager<H> {
|
||||
/// Note: It can't recover the mapping between puffin files and keys, so TTL
|
||||
/// is configured to purge the dangling files and directories.
|
||||
async fn recover(&self) -> Result<()> {
|
||||
let timer = std::time::Instant::now();
|
||||
info!("Recovering the staging area, base_dir: {:?}", self.base_dir);
|
||||
|
||||
let mut read_dir = fs::read_dir(&self.base_dir).await.context(ReadSnafu)?;
|
||||
|
||||
let mut elems = HashMap::new();
|
||||
@@ -430,6 +433,7 @@ impl<H> BoundedStager<H> {
|
||||
}
|
||||
|
||||
let mut size = 0;
|
||||
let num_elems = elems.len();
|
||||
for (key, value) in elems {
|
||||
size += value.size();
|
||||
self.cache.insert(key, value).await;
|
||||
@@ -440,6 +444,12 @@ impl<H> BoundedStager<H> {
|
||||
|
||||
self.cache.run_pending_tasks().await;
|
||||
|
||||
info!(
|
||||
"Recovered the staging area, num_entries: {}, num_bytes: {}, cost: {:?}",
|
||||
num_elems,
|
||||
size,
|
||||
timer.elapsed()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::{BTreeMap, BTreeSet, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::debug;
|
||||
@@ -32,10 +32,12 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
|
||||
use table::metadata::TableType;
|
||||
use table::table::adapter::DfTableProviderAdapter;
|
||||
|
||||
use crate::dist_plan::analyzer::utils::{aliased_columns_for, rewrite_merge_sort_exprs};
|
||||
use crate::dist_plan::commutativity::{
|
||||
partial_commutative_transformer, Categorizer, Commutativity,
|
||||
};
|
||||
use crate::dist_plan::merge_scan::MergeScanLogicalPlan;
|
||||
use crate::dist_plan::merge_sort::MergeSortLogicalPlan;
|
||||
use crate::metrics::PUSH_DOWN_FALLBACK_ERRORS_TOTAL;
|
||||
use crate::plan::ExtractExpr;
|
||||
use crate::query_engine::DefaultSerializer;
|
||||
@@ -46,7 +48,10 @@ mod test;
|
||||
mod fallback;
|
||||
mod utils;
|
||||
|
||||
pub(crate) use utils::{AliasMapping, AliasTracker};
|
||||
pub(crate) use utils::AliasMapping;
|
||||
|
||||
/// Placeholder for other physical partition columns that are not in logical table
|
||||
const OTHER_PHY_PART_COL_PLACEHOLDER: &str = "__OTHER_PHYSICAL_PART_COLS_PLACEHOLDER__";
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DistPlannerOptions {
|
||||
@@ -229,8 +234,7 @@ struct PlanRewriter {
|
||||
stage: Vec<LogicalPlan>,
|
||||
status: RewriterStatus,
|
||||
/// Partition columns of the table in current pass
|
||||
partition_cols: Option<Vec<String>>,
|
||||
alias_tracker: Option<AliasTracker>,
|
||||
partition_cols: Option<AliasMapping>,
|
||||
/// use stack count as scope to determine column requirements is needed or not
|
||||
/// i.e for a logical plan like:
|
||||
/// ```ignore
|
||||
@@ -288,7 +292,7 @@ impl PlanRewriter {
|
||||
}
|
||||
|
||||
/// Return true if should stop and expand. The input plan is the parent node of current node
|
||||
fn should_expand(&mut self, plan: &LogicalPlan) -> bool {
|
||||
fn should_expand(&mut self, plan: &LogicalPlan) -> DfResult<bool> {
|
||||
debug!(
|
||||
"Check should_expand at level: {} with Stack:\n{}, ",
|
||||
self.level,
|
||||
@@ -298,20 +302,21 @@ impl PlanRewriter {
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n"),
|
||||
);
|
||||
if DFLogicalSubstraitConvertor
|
||||
.encode(plan, DefaultSerializer)
|
||||
.is_err()
|
||||
{
|
||||
return true;
|
||||
if let Err(e) = DFLogicalSubstraitConvertor.encode(plan, DefaultSerializer) {
|
||||
debug!(
|
||||
"PlanRewriter: plan cannot be converted to substrait with error={e:?}, expanding now: {plan}"
|
||||
);
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
if self.expand_on_next_call {
|
||||
self.expand_on_next_call = false;
|
||||
return true;
|
||||
debug!("PlanRewriter: expand_on_next_call is true, expanding now");
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
if self.expand_on_next_part_cond_trans_commutative {
|
||||
let comm = Categorizer::check_plan(plan, self.get_aliased_partition_columns());
|
||||
let comm = Categorizer::check_plan(plan, self.partition_cols.clone())?;
|
||||
match comm {
|
||||
Commutativity::PartialCommutative => {
|
||||
// a small difference is that for partial commutative, we still need to
|
||||
@@ -327,13 +332,16 @@ impl PlanRewriter {
|
||||
// again a new node that can be push down, we should just
|
||||
// do push down now and avoid further expansion
|
||||
self.expand_on_next_part_cond_trans_commutative = false;
|
||||
return true;
|
||||
debug!(
|
||||
"PlanRewriter: meet a new conditional/transformed commutative plan, expanding now: {plan}"
|
||||
);
|
||||
return Ok(true);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
match Categorizer::check_plan(plan, self.get_aliased_partition_columns()) {
|
||||
match Categorizer::check_plan(plan, self.partition_cols.clone())? {
|
||||
Commutativity::Commutative => {}
|
||||
Commutativity::PartialCommutative => {
|
||||
if let Some(plan) = partial_commutative_transformer(plan) {
|
||||
@@ -354,9 +362,8 @@ impl PlanRewriter {
|
||||
}
|
||||
}
|
||||
Commutativity::TransformedCommutative { transformer } => {
|
||||
if let Some(transformer) = transformer
|
||||
&& let Some(transformer_actions) = transformer(plan)
|
||||
{
|
||||
if let Some(transformer) = transformer {
|
||||
let transformer_actions = transformer(plan)?;
|
||||
debug!(
|
||||
"PlanRewriter: transformed plan: {}\n from {plan}",
|
||||
transformer_actions
|
||||
@@ -387,11 +394,12 @@ impl PlanRewriter {
|
||||
Commutativity::NonCommutative
|
||||
| Commutativity::Unimplemented
|
||||
| Commutativity::Unsupported => {
|
||||
return true;
|
||||
debug!("PlanRewriter: meet a non-commutative plan, expanding now: {plan}");
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// Update the column requirements for the current plan, plan_level is the level of the plan
|
||||
@@ -427,49 +435,31 @@ impl PlanRewriter {
|
||||
self.status = RewriterStatus::Unexpanded;
|
||||
}
|
||||
|
||||
/// Maybe update alias for original table columns in the plan
|
||||
fn maybe_update_alias(&mut self, node: &LogicalPlan) {
|
||||
if let Some(alias_tracker) = &mut self.alias_tracker {
|
||||
alias_tracker.update_alias(node);
|
||||
debug!(
|
||||
"Current partition columns are: {:?}",
|
||||
self.get_aliased_partition_columns()
|
||||
);
|
||||
} else if let LogicalPlan::TableScan(table_scan) = node {
|
||||
self.alias_tracker = AliasTracker::new(table_scan);
|
||||
debug!(
|
||||
"Initialize partition columns: {:?} with table={}",
|
||||
self.get_aliased_partition_columns(),
|
||||
table_scan.table_name
|
||||
);
|
||||
}
|
||||
}
|
||||
fn maybe_set_partitions(&mut self, plan: &LogicalPlan) -> DfResult<()> {
|
||||
if let Some(part_cols) = &mut self.partition_cols {
|
||||
// update partition alias
|
||||
let child = plan.inputs().first().cloned().ok_or_else(|| {
|
||||
datafusion_common::DataFusionError::Internal(format!(
|
||||
"PlanRewriter: maybe_set_partitions: plan has no child: {plan}"
|
||||
))
|
||||
})?;
|
||||
|
||||
fn get_aliased_partition_columns(&self) -> Option<AliasMapping> {
|
||||
if let Some(part_cols) = self.partition_cols.as_ref() {
|
||||
let Some(alias_tracker) = &self.alias_tracker else {
|
||||
// no alias tracker meaning no table scan encountered
|
||||
return None;
|
||||
};
|
||||
let mut aliased = HashMap::new();
|
||||
for part_col in part_cols {
|
||||
let all_alias = alias_tracker
|
||||
.get_all_alias_for_col(part_col)
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
aliased.insert(part_col.clone(), all_alias);
|
||||
for (_col_name, alias_set) in part_cols.iter_mut() {
|
||||
let aliased_cols = aliased_columns_for(
|
||||
&alias_set.clone().into_iter().collect(),
|
||||
plan,
|
||||
Some(child),
|
||||
)?;
|
||||
*alias_set = aliased_cols.into_values().flatten().collect();
|
||||
}
|
||||
Some(aliased)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn maybe_set_partitions(&mut self, plan: &LogicalPlan) {
|
||||
if self.partition_cols.is_some() {
|
||||
// only need to set once
|
||||
return;
|
||||
debug!(
|
||||
"PlanRewriter: maybe_set_partitions: updated partition columns: {:?} at plan: {}",
|
||||
part_cols,
|
||||
plan.display()
|
||||
);
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if let LogicalPlan::TableScan(table_scan) = plan {
|
||||
@@ -506,14 +496,39 @@ impl PlanRewriter {
|
||||
// as subset of phy part cols can still be used for certain optimization, and it works as if
|
||||
// those columns are always null
|
||||
// This helps with distinguishing between non-partitioned table and partitioned table with all phy part cols not in logical table
|
||||
partition_cols
|
||||
.push("__OTHER_PHYSICAL_PART_COLS_PLACEHOLDER__".to_string());
|
||||
partition_cols.push(OTHER_PHY_PART_COL_PLACEHOLDER.to_string());
|
||||
}
|
||||
self.partition_cols = Some(partition_cols);
|
||||
self.partition_cols = Some(
|
||||
partition_cols
|
||||
.into_iter()
|
||||
.map(|c| {
|
||||
if c == OTHER_PHY_PART_COL_PLACEHOLDER {
|
||||
// for placeholder, just return a empty alias
|
||||
return Ok((c.clone(), BTreeSet::new()));
|
||||
}
|
||||
let index =
|
||||
plan.schema().index_of_column_by_name(None, &c).ok_or_else(|| {
|
||||
datafusion_common::DataFusionError::Internal(
|
||||
format!(
|
||||
"PlanRewriter: maybe_set_partitions: column {c} not found in schema of plan: {plan}"
|
||||
),
|
||||
)
|
||||
})?;
|
||||
let column = plan.schema().columns().get(index).cloned().ok_or_else(|| {
|
||||
datafusion_common::DataFusionError::Internal(format!(
|
||||
"PlanRewriter: maybe_set_partitions: column index {index} out of bounds in schema of plan: {plan}"
|
||||
))
|
||||
})?;
|
||||
Ok((c.clone(), BTreeSet::from([column])))
|
||||
})
|
||||
.collect::<DfResult<AliasMapping>>()?,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// pop one stack item and reduce the level by 1
|
||||
@@ -539,9 +554,14 @@ impl PlanRewriter {
|
||||
"PlanRewriter: after enforced column requirements with rewriter: {rewriter:?} for node:\n{on_node}"
|
||||
);
|
||||
|
||||
debug!(
|
||||
"PlanRewriter: expand on node: {on_node} with partition col alias mapping: {:?}",
|
||||
self.partition_cols
|
||||
);
|
||||
|
||||
// add merge scan as the new root
|
||||
let mut node = MergeScanLogicalPlan::new(
|
||||
on_node,
|
||||
on_node.clone(),
|
||||
false,
|
||||
// at this stage, the partition cols should be set
|
||||
// treat it as non-partitioned if None
|
||||
@@ -551,6 +571,15 @@ impl PlanRewriter {
|
||||
|
||||
// expand stages
|
||||
for new_stage in self.stage.drain(..) {
|
||||
// tracking alias for merge sort's sort exprs
|
||||
let new_stage = if let LogicalPlan::Extension(ext) = &new_stage
|
||||
&& let Some(merge_sort) = ext.node.as_any().downcast_ref::<MergeSortLogicalPlan>()
|
||||
{
|
||||
// TODO(discord9): change `on_node` to `node` once alias tracking is supported for merge scan
|
||||
rewrite_merge_sort_exprs(merge_sort, &on_node)?
|
||||
} else {
|
||||
new_stage
|
||||
};
|
||||
node = new_stage
|
||||
.with_new_exprs(new_stage.expressions_consider_join(), vec![node.clone()])?;
|
||||
}
|
||||
@@ -592,6 +621,7 @@ struct EnforceDistRequirementRewriter {
|
||||
/// when on `Projection` node, we don't need to apply the column requirements of `Aggregate` node
|
||||
/// because the `Projection` node is not in the scope of the `Aggregate` node
|
||||
cur_level: usize,
|
||||
plan_per_level: BTreeMap<usize, LogicalPlan>,
|
||||
}
|
||||
|
||||
impl EnforceDistRequirementRewriter {
|
||||
@@ -599,8 +629,67 @@ impl EnforceDistRequirementRewriter {
|
||||
Self {
|
||||
column_requirements,
|
||||
cur_level,
|
||||
plan_per_level: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a mapping from (original column, level) to aliased columns in current node of all
|
||||
/// applicable column requirements
|
||||
/// i.e. only column requirements with level >= `cur_level` will be considered
|
||||
fn get_current_applicable_column_requirements(
|
||||
&self,
|
||||
node: &LogicalPlan,
|
||||
) -> DfResult<BTreeMap<(Column, usize), BTreeSet<Column>>> {
|
||||
let col_req_per_level = self
|
||||
.column_requirements
|
||||
.iter()
|
||||
.filter(|(_, level)| *level >= self.cur_level)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// track alias for columns and use aliased columns instead
|
||||
// aliased col reqs at current level
|
||||
let mut result_alias_mapping = BTreeMap::new();
|
||||
let Some(child) = node.inputs().first().cloned() else {
|
||||
return Ok(Default::default());
|
||||
};
|
||||
for (col_req, level) in col_req_per_level {
|
||||
if let Some(original) = self.plan_per_level.get(level) {
|
||||
// query for alias in current plan
|
||||
let aliased_cols =
|
||||
aliased_columns_for(&col_req.iter().cloned().collect(), node, Some(original))?;
|
||||
for original_col in col_req {
|
||||
let aliased_cols = aliased_cols.get(original_col).cloned();
|
||||
if let Some(cols) = aliased_cols
|
||||
&& !cols.is_empty()
|
||||
{
|
||||
result_alias_mapping.insert((original_col.clone(), *level), cols);
|
||||
} else {
|
||||
// if no aliased column found in current node, there should be alias in child node as promised by enforce col reqs
|
||||
// because it should insert required columns in child node
|
||||
// so we can find the alias in child node
|
||||
// if not found, it's an internal error
|
||||
let aliases_in_child = aliased_columns_for(
|
||||
&[original_col.clone()].into(),
|
||||
child,
|
||||
Some(original),
|
||||
)?;
|
||||
let Some(aliases) = aliases_in_child
|
||||
.get(original_col)
|
||||
.cloned()
|
||||
.filter(|a| !a.is_empty())
|
||||
else {
|
||||
return Err(datafusion_common::DataFusionError::Internal(format!(
|
||||
"EnforceDistRequirementRewriter: no alias found for required column {original_col} in child plan {child} from original plan {original}",
|
||||
)));
|
||||
};
|
||||
|
||||
result_alias_mapping.insert((original_col.clone(), *level), aliases);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result_alias_mapping)
|
||||
}
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for EnforceDistRequirementRewriter {
|
||||
@@ -614,6 +703,7 @@ impl TreeNodeRewriter for EnforceDistRequirementRewriter {
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
self.plan_per_level.insert(self.cur_level, node.clone());
|
||||
self.cur_level += 1;
|
||||
Ok(Transformed::no(node))
|
||||
}
|
||||
@@ -621,38 +711,41 @@ impl TreeNodeRewriter for EnforceDistRequirementRewriter {
|
||||
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
|
||||
self.cur_level -= 1;
|
||||
// first get all applicable column requirements
|
||||
let mut applicable_column_requirements = self
|
||||
.column_requirements
|
||||
.iter()
|
||||
.filter(|(_, level)| *level >= self.cur_level)
|
||||
.map(|(cols, _)| cols.clone())
|
||||
.reduce(|mut acc, cols| {
|
||||
acc.extend(cols);
|
||||
acc
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
debug!(
|
||||
"EnforceDistRequirementRewriter: applicable column requirements at level {} = {:?} for node {}",
|
||||
self.cur_level,
|
||||
applicable_column_requirements,
|
||||
node.display()
|
||||
);
|
||||
|
||||
// make sure all projection applicable scope has the required columns
|
||||
if let LogicalPlan::Projection(ref projection) = node {
|
||||
let mut applicable_column_requirements =
|
||||
self.get_current_applicable_column_requirements(&node)?;
|
||||
|
||||
debug!(
|
||||
"EnforceDistRequirementRewriter: applicable column requirements at level {} = {:?} for node {}",
|
||||
self.cur_level,
|
||||
applicable_column_requirements,
|
||||
node.display()
|
||||
);
|
||||
|
||||
for expr in &projection.expr {
|
||||
let (qualifier, name) = expr.qualified_name();
|
||||
let column = Column::new(qualifier, name);
|
||||
applicable_column_requirements.remove(&column);
|
||||
applicable_column_requirements.retain(|_col_level, alias_set| {
|
||||
// remove all columns that are already in the projection exprs
|
||||
!alias_set.contains(&column)
|
||||
});
|
||||
}
|
||||
if applicable_column_requirements.is_empty() {
|
||||
return Ok(Transformed::no(node));
|
||||
}
|
||||
|
||||
let mut new_exprs = projection.expr.clone();
|
||||
for col in &applicable_column_requirements {
|
||||
new_exprs.push(Expr::Column(col.clone()));
|
||||
for (col, alias_set) in &applicable_column_requirements {
|
||||
// use the first alias in alias set as the column to add
|
||||
new_exprs.push(Expr::Column(alias_set.first().cloned().ok_or_else(
|
||||
|| {
|
||||
datafusion_common::DataFusionError::Internal(
|
||||
format!("EnforceDistRequirementRewriter: alias set is empty, for column {col:?} in node {node}"),
|
||||
)
|
||||
},
|
||||
)?));
|
||||
}
|
||||
let new_node =
|
||||
node.with_new_exprs(new_exprs, node.inputs().into_iter().cloned().collect())?;
|
||||
@@ -661,6 +754,9 @@ impl TreeNodeRewriter for EnforceDistRequirementRewriter {
|
||||
applicable_column_requirements
|
||||
);
|
||||
|
||||
// update plan for later use
|
||||
self.plan_per_level.insert(self.cur_level, new_node.clone());
|
||||
|
||||
// still need to continue for next projection if applicable
|
||||
return Ok(Transformed::yes(new_node));
|
||||
}
|
||||
@@ -679,7 +775,6 @@ impl TreeNodeRewriter for PlanRewriter {
|
||||
self.stage.clear();
|
||||
self.set_unexpanded();
|
||||
self.partition_cols = None;
|
||||
self.alias_tracker = None;
|
||||
Ok(Transformed::no(node))
|
||||
}
|
||||
|
||||
@@ -700,9 +795,7 @@ impl TreeNodeRewriter for PlanRewriter {
|
||||
return Ok(Transformed::no(node));
|
||||
}
|
||||
|
||||
self.maybe_set_partitions(&node);
|
||||
|
||||
self.maybe_update_alias(&node);
|
||||
self.maybe_set_partitions(&node)?;
|
||||
|
||||
let Some(parent) = self.get_parent() else {
|
||||
debug!("Plan Rewriter: expand now for no parent found for node: {node}");
|
||||
@@ -721,8 +814,7 @@ impl TreeNodeRewriter for PlanRewriter {
|
||||
|
||||
let parent = parent.clone();
|
||||
|
||||
// TODO(ruihang): avoid this clone
|
||||
if self.should_expand(&parent) {
|
||||
if self.should_expand(&parent)? {
|
||||
// TODO(ruihang): does this work for nodes with multiple children?;
|
||||
debug!(
|
||||
"PlanRewriter: should expand child:\n {node}\n Of Parent: {}",
|
||||
|
||||
@@ -17,13 +17,17 @@
|
||||
//! This is a temporary solution, and will be removed once we have a more robust plan rewriter
|
||||
//!
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use common_telemetry::debug;
|
||||
use datafusion::datasource::DefaultTableSource;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNodeRewriter};
|
||||
use datafusion_common::Result as DfResult;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use table::metadata::TableType;
|
||||
use table::table::adapter::DfTableProviderAdapter;
|
||||
|
||||
use crate::dist_plan::analyzer::{AliasMapping, OTHER_PHY_PART_COL_PLACEHOLDER};
|
||||
use crate::dist_plan::MergeScanLogicalPlan;
|
||||
|
||||
/// FallbackPlanRewriter is a plan rewriter that will only push down table scan node
|
||||
@@ -38,9 +42,9 @@ impl TreeNodeRewriter for FallbackPlanRewriter {
|
||||
|
||||
fn f_down(
|
||||
&mut self,
|
||||
node: Self::Node,
|
||||
) -> datafusion_common::Result<datafusion_common::tree_node::Transformed<Self::Node>> {
|
||||
if let LogicalPlan::TableScan(table_scan) = &node {
|
||||
plan: Self::Node,
|
||||
) -> DfResult<datafusion_common::tree_node::Transformed<Self::Node>> {
|
||||
if let LogicalPlan::TableScan(table_scan) = &plan {
|
||||
let partition_cols = if let Some(source) = table_scan
|
||||
.source
|
||||
.as_any()
|
||||
@@ -63,7 +67,29 @@ impl TreeNodeRewriter for FallbackPlanRewriter {
|
||||
"FallbackPlanRewriter: table {} has partition columns: {:?}",
|
||||
info.name, partition_cols
|
||||
);
|
||||
Some(partition_cols)
|
||||
Some(partition_cols
|
||||
.into_iter()
|
||||
.map(|c| {
|
||||
if c == OTHER_PHY_PART_COL_PLACEHOLDER {
|
||||
// for placeholder, just return a empty alias
|
||||
return Ok((c.clone(), BTreeSet::new()));
|
||||
}
|
||||
let index =
|
||||
plan.schema().index_of_column_by_name(None, &c).ok_or_else(|| {
|
||||
datafusion_common::DataFusionError::Internal(
|
||||
format!(
|
||||
"PlanRewriter: maybe_set_partitions: column {c} not found in schema of plan: {plan}"
|
||||
),
|
||||
)
|
||||
})?;
|
||||
let column = plan.schema().columns().get(index).cloned().ok_or_else(|| {
|
||||
datafusion_common::DataFusionError::Internal(format!(
|
||||
"PlanRewriter: maybe_set_partitions: column index {index} out of bounds in schema of plan: {plan}"
|
||||
))
|
||||
})?;
|
||||
Ok((c.clone(), BTreeSet::from([column])))
|
||||
})
|
||||
.collect::<DfResult<AliasMapping>>()?)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -74,7 +100,7 @@ impl TreeNodeRewriter for FallbackPlanRewriter {
|
||||
None
|
||||
};
|
||||
let node = MergeScanLogicalPlan::new(
|
||||
node,
|
||||
plan,
|
||||
false,
|
||||
// at this stage, the partition cols should be set
|
||||
// treat it as non-partitioned if None
|
||||
@@ -83,7 +109,7 @@ impl TreeNodeRewriter for FallbackPlanRewriter {
|
||||
.into_logical_plan();
|
||||
Ok(Transformed::yes(node))
|
||||
} else {
|
||||
Ok(Transformed::no(node))
|
||||
Ok(Transformed::no(plan))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,17 +15,23 @@
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::datatypes::IntervalDayTime;
|
||||
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
|
||||
use common_error::ext::BoxedError;
|
||||
use common_function::aggrs::aggr_wrapper::{StateMergeHelper, StateWrapper};
|
||||
use common_recordbatch::adapter::RecordBatchMetrics;
|
||||
use common_recordbatch::error::Result as RecordBatchResult;
|
||||
use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use datafusion::datasource::DefaultTableSource;
|
||||
use datafusion::execution::SessionState;
|
||||
use datafusion::functions_aggregate::expr_fn::avg;
|
||||
use datafusion::functions_aggregate::min_max::{max, min};
|
||||
use datafusion::prelude::SessionContext;
|
||||
use datafusion_common::JoinType;
|
||||
use datafusion_expr::{col, lit, Expr, LogicalPlanBuilder};
|
||||
use datafusion_expr::expr::ScalarFunction;
|
||||
use datafusion_expr::{col, lit, AggregateUDF, Expr, LogicalPlanBuilder};
|
||||
use datafusion_functions::datetime::date_bin;
|
||||
use datafusion_sql::TableReference;
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnSchema, SchemaBuilder, SchemaRef};
|
||||
@@ -152,11 +158,30 @@ impl Stream for EmptyStream {
|
||||
}
|
||||
}
|
||||
|
||||
fn try_encode_decode_substrait(plan: &LogicalPlan, state: SessionState) {
|
||||
let sub_plan_bytes = substrait::DFLogicalSubstraitConvertor
|
||||
.encode(plan, crate::query_engine::DefaultSerializer)
|
||||
.unwrap();
|
||||
let inner = sub_plan_bytes.clone();
|
||||
let decoded_plan = futures::executor::block_on(async move {
|
||||
substrait::DFLogicalSubstraitConvertor
|
||||
.decode(inner, state)
|
||||
.await
|
||||
}).inspect_err(|e|{
|
||||
use prost::Message;
|
||||
let sub_plan = substrait::substrait_proto_df::proto::Plan::decode(sub_plan_bytes).unwrap();
|
||||
common_telemetry::error!("Failed to decode substrait plan: {e},substrait plan: {sub_plan:#?}\nlogical plan: {plan:#?}");
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(*plan, decoded_plan);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn expand_proj_sort_proj() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -199,11 +224,58 @@ fn expand_proj_sort_proj() {
|
||||
assert_eq!(expected, result.to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn expand_proj_sort_partial_proj() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
|
||||
.unwrap()
|
||||
.project(vec![col("number"), col("pk1"), col("pk2"), col("pk3")])
|
||||
.unwrap()
|
||||
.project(vec![
|
||||
col("number"),
|
||||
col("pk1"),
|
||||
col("pk3"),
|
||||
col("pk1").eq(col("pk2")),
|
||||
])
|
||||
.unwrap()
|
||||
.sort(vec![col("t.pk1 = t.pk2").sort(true, true)])
|
||||
.unwrap()
|
||||
.project(vec![col("number"), col("t.pk1 = t.pk2").alias("eq_sorted")])
|
||||
.unwrap()
|
||||
.project(vec![col("number")])
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let config = ConfigOptions::default();
|
||||
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
|
||||
|
||||
let expected = [
|
||||
"Projection: t.number",
|
||||
" MergeSort: eq_sorted ASC NULLS FIRST", // notice how `eq_sorted` is used here
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"Projection: t.number, eq_sorted", // notice how `eq_sorted` is added not `t.pk1 = t.pk2`
|
||||
" Projection: t.number, t.pk1 = t.pk2 AS eq_sorted",
|
||||
" Sort: t.pk1 = t.pk2 ASC NULLS FIRST",
|
||||
" Projection: t.number, t.pk1, t.pk3, t.pk1 = t.pk2",
|
||||
" Projection: t.number, t.pk1, t.pk2, t.pk3", // notice this projection doesn't add `t.pk1 = t.pk2` column requirement
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
.join("\n");
|
||||
assert_eq!(expected, result.to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn expand_sort_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -233,11 +305,13 @@ fn expand_sort_limit() {
|
||||
assert_eq!(expected, result.to_string());
|
||||
}
|
||||
|
||||
/// Test merge sort can apply enforce dist requirement columns correctly and use the aliased column correctly, as there is
|
||||
/// a aliased sort column, there is no need to add a duplicate sort column using it's original column name
|
||||
#[test]
|
||||
fn expand_sort_alias_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -258,10 +332,10 @@ fn expand_sort_alias_limit() {
|
||||
let expected = [
|
||||
"Projection: something",
|
||||
" Limit: skip=0, fetch=10",
|
||||
" MergeSort: t.pk1 ASC NULLS LAST",
|
||||
" MergeSort: something ASC NULLS LAST",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"Limit: skip=0, fetch=10",
|
||||
" Projection: t.pk1 AS something, t.pk1",
|
||||
" Projection: t.pk1 AS something",
|
||||
" Sort: t.pk1 ASC NULLS LAST",
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
@@ -276,7 +350,7 @@ fn expand_sort_alias_limit() {
|
||||
fn expand_sort_alias_conflict_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -318,7 +392,7 @@ fn expand_sort_alias_conflict_limit() {
|
||||
fn expand_sort_alias_conflict_but_not_really_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -358,7 +432,7 @@ fn expand_sort_alias_conflict_but_not_really_limit() {
|
||||
fn expand_limit_sort() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -391,7 +465,7 @@ fn expand_limit_sort() {
|
||||
fn expand_sort_limit_sort() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -438,7 +512,7 @@ fn expand_sort_limit_sort() {
|
||||
fn expand_proj_step_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -473,7 +547,7 @@ fn expand_proj_step_aggr() {
|
||||
fn expand_proj_alias_fake_part_col_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -517,7 +591,7 @@ fn expand_proj_alias_fake_part_col_aggr() {
|
||||
fn expand_proj_alias_aliased_part_col_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -563,7 +637,7 @@ fn expand_proj_alias_aliased_part_col_aggr() {
|
||||
fn expand_part_col_aggr_step_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -596,7 +670,7 @@ fn expand_part_col_aggr_step_aggr() {
|
||||
fn expand_step_aggr_step_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -629,7 +703,7 @@ fn expand_step_aggr_step_aggr() {
|
||||
fn expand_part_col_aggr_part_col_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -673,7 +747,7 @@ fn expand_part_col_aggr_part_col_aggr() {
|
||||
fn expand_step_aggr_proj() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -709,7 +783,7 @@ fn expand_step_aggr_proj() {
|
||||
fn expand_proj_sort_step_aggr_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -750,7 +824,7 @@ fn expand_proj_sort_step_aggr_limit() {
|
||||
fn expand_proj_sort_limit_step_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -792,7 +866,7 @@ fn expand_proj_sort_limit_step_aggr() {
|
||||
fn expand_proj_limit_step_aggr_sort() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -833,7 +907,7 @@ fn expand_proj_limit_step_aggr_sort() {
|
||||
fn expand_proj_sort_part_col_aggr_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -875,7 +949,7 @@ fn expand_proj_sort_part_col_aggr_limit() {
|
||||
fn expand_proj_sort_limit_part_col_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -917,7 +991,7 @@ fn expand_proj_sort_limit_part_col_aggr() {
|
||||
fn expand_proj_part_col_aggr_limit_sort() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -959,7 +1033,7 @@ fn expand_proj_part_col_aggr_limit_sort() {
|
||||
fn expand_proj_part_col_aggr_sort_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -1002,7 +1076,7 @@ fn expand_proj_part_col_aggr_sort_limit() {
|
||||
fn expand_proj_limit_part_col_aggr_sort() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -1044,7 +1118,7 @@ fn expand_proj_limit_part_col_aggr_sort() {
|
||||
fn expand_proj_limit_sort_part_col_aggr() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -1087,7 +1161,7 @@ fn expand_proj_limit_sort_part_col_aggr() {
|
||||
fn expand_step_aggr_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -1120,7 +1194,7 @@ fn expand_step_aggr_limit() {
|
||||
fn expand_step_aggr_avg_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -1153,7 +1227,7 @@ fn expand_step_aggr_avg_limit() {
|
||||
fn expand_part_col_aggr_limit() {
|
||||
// use logging for better debugging
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
@@ -1332,10 +1406,224 @@ fn transform_unalighed_join_with_alias() {
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"TableScan: t",
|
||||
"]]",
|
||||
" SubqueryAlias: right",
|
||||
" Projection: t.number",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"TableScan: t",
|
||||
" Projection: right.number",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"SubqueryAlias: right",
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
.join("\n");
|
||||
assert_eq!(expected, result.to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn transform_subquery_sort_alias() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
|
||||
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
|
||||
.unwrap()
|
||||
.alias("a")
|
||||
.unwrap()
|
||||
.sort(vec![col("a.number").sort(true, false)])
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap();
|
||||
let config = ConfigOptions::default();
|
||||
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
|
||||
let expected = [
|
||||
"Projection: a.pk1, a.pk2, a.pk3, a.ts, a.number",
|
||||
" MergeSort: a.number ASC NULLS LAST",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"Sort: a.number ASC NULLS LAST",
|
||||
" SubqueryAlias: a",
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
.join("\n");
|
||||
assert_eq!(expected, result.to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn transform_sort_subquery_alias() {
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "numbers".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
|
||||
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
|
||||
.unwrap()
|
||||
.sort(vec![col("t.number").sort(true, false)])
|
||||
.unwrap()
|
||||
.alias("a")
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap();
|
||||
let config = ConfigOptions::default();
|
||||
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
|
||||
let expected = [
|
||||
"Projection: a.pk1, a.pk2, a.pk3, a.ts, a.number",
|
||||
" MergeSort: a.number ASC NULLS LAST",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"SubqueryAlias: a",
|
||||
" Sort: t.number ASC NULLS LAST",
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
.join("\n");
|
||||
assert_eq!(expected, result.to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn date_bin_ts_group_by() {
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
|
||||
DfTableProviderAdapter::new(test_table),
|
||||
)));
|
||||
let date_bin_call = Expr::ScalarFunction(ScalarFunction::new_udf(
|
||||
date_bin(),
|
||||
vec![
|
||||
lit(datafusion_common::ScalarValue::IntervalDayTime(Some(
|
||||
IntervalDayTime::new(0, 60 * 1000), // 1 minute in millis
|
||||
))),
|
||||
col("ts"),
|
||||
],
|
||||
));
|
||||
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
|
||||
.unwrap()
|
||||
.aggregate(vec![date_bin_call], vec![min(col("number"))])
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let config = ConfigOptions::default();
|
||||
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
|
||||
|
||||
let expected = [
|
||||
r#"Projection: date_bin(IntervalDayTime("IntervalDayTime { days: 0, milliseconds: 60000 }"),t.ts), min(t.number)"#,
|
||||
r#" Aggregate: groupBy=[[date_bin(IntervalDayTime("IntervalDayTime { days: 0, milliseconds: 60000 }"),t.ts)]], aggr=[[__min_merge(__min_state(t.number)) AS min(t.number)]]"#,
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
r#"Aggregate: groupBy=[[date_bin(IntervalDayTime("IntervalDayTime { days: 0, milliseconds: 60000 }"), t.ts)]], aggr=[[__min_state(t.number)]]"#,
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
.join("\n");
|
||||
assert_eq!(expected, result.to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_last_value_order_by() {
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_provider = Arc::new(DfTableProviderAdapter::new(test_table));
|
||||
let table_source = Arc::new(DefaultTableSource::new(table_provider.clone() as _));
|
||||
let ctx = SessionContext::new();
|
||||
ctx.register_table(TableReference::bare("t"), table_provider.clone() as _)
|
||||
.unwrap();
|
||||
ctx.register_udaf(AggregateUDF::new_from_impl(
|
||||
StateWrapper::new(
|
||||
datafusion::functions_aggregate::first_last::last_value_udaf()
|
||||
.as_ref()
|
||||
.clone(),
|
||||
)
|
||||
.unwrap(),
|
||||
));
|
||||
|
||||
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source.clone(), None, vec![])
|
||||
.unwrap()
|
||||
.aggregate(
|
||||
Vec::<Expr>::new(),
|
||||
vec![datafusion::functions_aggregate::first_last::last_value(
|
||||
col("ts"),
|
||||
vec![col("ts").sort(true, true)],
|
||||
)],
|
||||
)
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
try_encode_decode_substrait(&plan, ctx.state());
|
||||
|
||||
let config = ConfigOptions::default();
|
||||
let result = DistPlannerAnalyzer {}
|
||||
.analyze(plan.clone(), &config)
|
||||
.unwrap();
|
||||
|
||||
let expected = [
|
||||
"Projection: last_value(t.ts) ORDER BY [t.ts ASC NULLS FIRST]",
|
||||
" Aggregate: groupBy=[[]], aggr=[[__last_value_merge(__last_value_state(t.ts) ORDER BY [t.ts ASC NULLS FIRST]) AS last_value(t.ts) ORDER BY [t.ts ASC NULLS FIRST]]]",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"Aggregate: groupBy=[[]], aggr=[[__last_value_state(t.ts) ORDER BY [t.ts ASC NULLS FIRST]]]",
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
.join("\n");
|
||||
assert_eq!(expected, result.to_string());
|
||||
|
||||
let LogicalPlan::Aggregate(aggr_plan) = plan else {
|
||||
panic!("expect Aggregate plan");
|
||||
};
|
||||
let split = StateMergeHelper::split_aggr_node(aggr_plan).unwrap();
|
||||
|
||||
try_encode_decode_substrait(&split.lower_state, ctx.state());
|
||||
}
|
||||
|
||||
/// Try removing the order by to check that it still works.
|
||||
#[test]
|
||||
fn test_last_value_no_order_by() {
|
||||
init_default_ut_logging();
|
||||
let test_table = TestTable::table_with_name(0, "t".to_string());
|
||||
let table_provider = Arc::new(DfTableProviderAdapter::new(test_table));
|
||||
let table_source = Arc::new(DefaultTableSource::new(table_provider.clone() as _));
|
||||
let ctx = SessionContext::new();
|
||||
ctx.register_table(TableReference::bare("t"), table_provider.clone() as _)
|
||||
.unwrap();
|
||||
ctx.register_udaf(AggregateUDF::new_from_impl(
|
||||
StateWrapper::new(
|
||||
datafusion::functions_aggregate::first_last::last_value_udaf()
|
||||
.as_ref()
|
||||
.clone(),
|
||||
)
|
||||
.unwrap(),
|
||||
));
|
||||
|
||||
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
|
||||
.unwrap()
|
||||
.aggregate(
|
||||
Vec::<Expr>::new(),
|
||||
vec![datafusion::functions_aggregate::first_last::last_value(
|
||||
col("ts"),
|
||||
vec![],
|
||||
)],
|
||||
)
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let LogicalPlan::Aggregate(aggr_plan) = plan.clone() else {
|
||||
panic!("expect Aggregate plan");
|
||||
};
|
||||
let split = StateMergeHelper::split_aggr_node(aggr_plan).unwrap();
|
||||
|
||||
try_encode_decode_substrait(&split.lower_state, ctx.state());
|
||||
|
||||
let config = ConfigOptions::default();
|
||||
let result = DistPlannerAnalyzer {}
|
||||
.analyze(plan.clone(), &config)
|
||||
.unwrap();
|
||||
|
||||
let expected = [
|
||||
"Projection: last_value(t.ts)",
|
||||
" Aggregate: groupBy=[[]], aggr=[[__last_value_merge(__last_value_state(t.ts)) AS last_value(t.ts)]]",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
"Aggregate: groupBy=[[]], aggr=[[__last_value_state(t.ts)]]",
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
.join("\n");
|
||||
|
||||
File diff suppressed because it is too large
@@ -15,9 +15,9 @@
|
||||
use std::collections::HashSet;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_function::aggrs::aggr_wrapper::{aggr_state_func_name, StateMergeHelper};
|
||||
use common_function::function_registry::FUNCTION_REGISTRY;
|
||||
use common_function::aggrs::aggr_wrapper::{is_all_aggr_exprs_steppable, StateMergeHelper};
|
||||
use common_telemetry::debug;
|
||||
use datafusion::error::Result as DfResult;
|
||||
use datafusion_expr::{Expr, LogicalPlan, UserDefinedLogicalNode};
|
||||
use promql::extension_plan::{
|
||||
EmptyMetric, InstantManipulate, RangeManipulate, SeriesDivide, SeriesNormalize,
|
||||
@@ -71,38 +71,6 @@ pub fn step_aggr_to_upper_aggr(
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
/// Check if the given aggregate expression is steppable.
|
||||
/// That is, whether it can be split into multiple steps:
|
||||
/// i.e. on datanode first call `state(input)` then
|
||||
/// on frontend call `calc(merge(state))` to get the final result.
|
||||
pub fn is_all_aggr_exprs_steppable(aggr_exprs: &[Expr]) -> bool {
|
||||
aggr_exprs.iter().all(|expr| {
|
||||
if let Some(aggr_func) = get_aggr_func(expr) {
|
||||
if aggr_func.params.distinct {
|
||||
// Distinct aggregate functions are not steppable (yet).
|
||||
return false;
|
||||
}
|
||||
|
||||
// whether the corresponding state function exists in the registry
|
||||
FUNCTION_REGISTRY.is_aggr_func_exist(&aggr_state_func_name(aggr_func.func.name()))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
}
|
||||
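// ---- Editor's sketch (not part of this diff) -----------------------------------
// Illustrates the "steppable" split described in the doc comment above with a
// plain-Rust stand-in for avg(): the datanode computes a partial state and the
// frontend merges the states and finalizes the value. The `__avg_state` /
// `__avg_merge` naming mirrors the `__min_state` / `__min_merge` functions seen in
// the expected plans, but this struct and its methods are purely illustrative.

/// Partial state for avg(): (sum, count).
#[derive(Default, Clone, Copy)]
struct AvgState {
    sum: f64,
    count: u64,
}

impl AvgState {
    /// Datanode side, conceptually `__avg_state(input)`.
    fn update(&mut self, v: f64) {
        self.sum += v;
        self.count += 1;
    }

    /// Frontend side, conceptually `avg = finalize(__avg_merge(states))`.
    fn merge(states: &[AvgState]) -> Option<f64> {
        let (sum, count) = states
            .iter()
            .fold((0.0, 0u64), |(s, c), st| (s + st.sum, c + st.count));
        (count > 0).then(|| sum / count as f64)
    }
}

fn main() {
    // Two "datanodes" fold their local rows into partial states...
    let mut a = AvgState::default();
    [1.0, 2.0].iter().for_each(|v| a.update(*v));
    let mut b = AvgState::default();
    [3.0, 4.0, 5.0].iter().for_each(|v| b.update(*v));
    // ...and the frontend merges the states into the final result.
    assert_eq!(AvgState::merge(&[a, b]), Some(3.0));
}
// ---------------------------------------------------------------------------------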
|
||||
pub fn get_aggr_func(expr: &Expr) -> Option<&datafusion_expr::expr::AggregateFunction> {
|
||||
let mut expr_ref = expr;
|
||||
while let Expr::Alias(alias) = expr_ref {
|
||||
expr_ref = &alias.expr;
|
||||
}
|
||||
if let Expr::AggregateFunction(aggr_func) = expr_ref {
|
||||
Some(aggr_func)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub enum Commutativity {
|
||||
Commutative,
|
||||
@@ -121,15 +89,18 @@ pub enum Commutativity {
|
||||
pub struct Categorizer {}
|
||||
|
||||
impl Categorizer {
|
||||
pub fn check_plan(plan: &LogicalPlan, partition_cols: Option<AliasMapping>) -> Commutativity {
|
||||
pub fn check_plan(
|
||||
plan: &LogicalPlan,
|
||||
partition_cols: Option<AliasMapping>,
|
||||
) -> DfResult<Commutativity> {
|
||||
let partition_cols = partition_cols.unwrap_or_default();
|
||||
|
||||
match plan {
|
||||
let comm = match plan {
|
||||
LogicalPlan::Projection(proj) => {
|
||||
for expr in &proj.expr {
|
||||
let commutativity = Self::check_expr(expr);
|
||||
if !matches!(commutativity, Commutativity::Commutative) {
|
||||
return commutativity;
|
||||
return Ok(commutativity);
|
||||
}
|
||||
}
|
||||
Commutativity::Commutative
|
||||
@@ -142,24 +113,27 @@ impl Categorizer {
|
||||
let matches_partition = Self::check_partition(&aggr.group_expr, &partition_cols);
|
||||
if !matches_partition && is_all_steppable {
|
||||
debug!("Plan is steppable: {plan}");
|
||||
return Commutativity::TransformedCommutative {
|
||||
return Ok(Commutativity::TransformedCommutative {
|
||||
transformer: Some(Arc::new(|plan: &LogicalPlan| {
|
||||
debug!("Before Step optimize: {plan}");
|
||||
let ret = step_aggr_to_upper_aggr(plan);
|
||||
ret.ok().map(|s| TransformerAction {
|
||||
ret.inspect_err(|err| {
|
||||
common_telemetry::error!("Failed to step aggregate plan: {err:?}");
|
||||
})
|
||||
.map(|s| TransformerAction {
|
||||
extra_parent_plans: s.extra_parent_plans,
|
||||
new_child_plan: s.new_child_plan,
|
||||
})
|
||||
})),
|
||||
};
|
||||
});
|
||||
}
|
||||
if !matches_partition {
|
||||
return Commutativity::NonCommutative;
|
||||
return Ok(Commutativity::NonCommutative);
|
||||
}
|
||||
for expr in &aggr.aggr_expr {
|
||||
let commutativity = Self::check_expr(expr);
|
||||
if !matches!(commutativity, Commutativity::Commutative) {
|
||||
return commutativity;
|
||||
return Ok(commutativity);
|
||||
}
|
||||
}
|
||||
// if all group-by expressions are partition columns, the aggregation can be pushed down, unless
|
||||
@@ -170,7 +144,7 @@ impl Categorizer {
|
||||
}
|
||||
LogicalPlan::Sort(_) => {
|
||||
if partition_cols.is_empty() {
|
||||
return Commutativity::Commutative;
|
||||
return Ok(Commutativity::Commutative);
|
||||
}
|
||||
|
||||
// sort plan needs to consider column priority
|
||||
@@ -187,7 +161,7 @@ impl Categorizer {
|
||||
LogicalPlan::TableScan(_) => Commutativity::Commutative,
|
||||
LogicalPlan::EmptyRelation(_) => Commutativity::NonCommutative,
|
||||
LogicalPlan::Subquery(_) => Commutativity::Unimplemented,
|
||||
LogicalPlan::SubqueryAlias(_) => Commutativity::Unimplemented,
|
||||
LogicalPlan::SubqueryAlias(_) => Commutativity::Commutative,
|
||||
LogicalPlan::Limit(limit) => {
|
||||
// Only execute `fetch` on remote nodes.
|
||||
// wait for https://github.com/apache/arrow-datafusion/pull/7669
|
||||
@@ -219,7 +193,9 @@ impl Categorizer {
|
||||
LogicalPlan::Ddl(_) => Commutativity::Unsupported,
|
||||
LogicalPlan::Copy(_) => Commutativity::Unsupported,
|
||||
LogicalPlan::RecursiveQuery(_) => Commutativity::Unsupported,
|
||||
}
|
||||
};
|
||||
|
||||
Ok(comm)
|
||||
}
|
||||
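// Editor's note (illustrative, not part of this diff): with `check_plan` now returning
// `DfResult<Commutativity>`, a transformer failure surfaces at the call site instead of
// being silently swallowed. A hypothetical caller:
//
//     let comm = Categorizer::check_plan(&plan, partition_cols)?;
//     if matches!(comm, Commutativity::Commutative) {
//         // safe to push the plan below the MergeScan boundary
//     }
//
// The real call sites live in the analyzer, which is not shown in this hunk.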
|
||||
pub fn check_extension_plan(
|
||||
@@ -302,6 +278,10 @@ impl Categorizer {
|
||||
|
||||
/// Return true if the given exprs and partition cols satisfy the rule.
|
||||
/// In this case the plan can be treated as fully commutative.
|
||||
///
|
||||
/// So only if all partition columns show up in `exprs`, return true.
|
||||
/// Otherwise return false.
|
||||
///
|
||||
fn check_partition(exprs: &[Expr], partition_cols: &AliasMapping) -> bool {
|
||||
let mut ref_cols = HashSet::new();
|
||||
for expr in exprs {
|
||||
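// Editor's example for the rule documented above (column names are hypothetical):
//   partition columns {host, region}, exprs [host, region, date_bin(...)] -> true
//     (every partition column appears, so each group is confined to a single partition)
//   partition columns {host, region}, exprs [host]                        -> false
//     (rows sharing a `host` may live on partitions split by `region`, so partial
//      results would still need a final merge)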
@@ -330,7 +310,7 @@ impl Categorizer {
|
||||
pub type Transformer = Arc<dyn Fn(&LogicalPlan) -> Option<LogicalPlan>>;
|
||||
|
||||
/// Returns transformer action that need to be applied
|
||||
pub type StageTransformer = Arc<dyn Fn(&LogicalPlan) -> Option<TransformerAction>>;
|
||||
pub type StageTransformer = Arc<dyn Fn(&LogicalPlan) -> DfResult<TransformerAction>>;
|
||||
|
||||
/// The Action that a transformer should take on the plan.
|
||||
pub struct TransformerAction {
|
||||
@@ -365,7 +345,7 @@ mod test {
|
||||
fetch: None,
|
||||
});
|
||||
assert!(matches!(
|
||||
Categorizer::check_plan(&plan, Some(Default::default())),
|
||||
Categorizer::check_plan(&plan, Some(Default::default())).unwrap(),
|
||||
Commutativity::Commutative
|
||||
));
|
||||
}
|
||||
|
||||
@@ -52,6 +52,7 @@ use store_api::storage::RegionId;
|
||||
use table::table_name::TableName;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::dist_plan::analyzer::AliasMapping;
|
||||
use crate::error::ConvertSchemaSnafu;
|
||||
use crate::metrics::{MERGE_SCAN_ERRORS_TOTAL, MERGE_SCAN_POLL_ELAPSED, MERGE_SCAN_REGIONS};
|
||||
use crate::region_query::RegionQueryHandlerRef;
|
||||
@@ -62,7 +63,7 @@ pub struct MergeScanLogicalPlan {
|
||||
input: LogicalPlan,
|
||||
/// If this plan is a placeholder
|
||||
is_placeholder: bool,
|
||||
partition_cols: Vec<String>,
|
||||
partition_cols: AliasMapping,
|
||||
}
|
||||
|
||||
impl UserDefinedLogicalNodeCore for MergeScanLogicalPlan {
|
||||
@@ -103,7 +104,7 @@ impl UserDefinedLogicalNodeCore for MergeScanLogicalPlan {
|
||||
}
|
||||
|
||||
impl MergeScanLogicalPlan {
|
||||
pub fn new(input: LogicalPlan, is_placeholder: bool, partition_cols: Vec<String>) -> Self {
|
||||
pub fn new(input: LogicalPlan, is_placeholder: bool, partition_cols: AliasMapping) -> Self {
|
||||
Self {
|
||||
input,
|
||||
is_placeholder,
|
||||
@@ -130,7 +131,7 @@ impl MergeScanLogicalPlan {
|
||||
&self.input
|
||||
}
|
||||
|
||||
pub fn partition_cols(&self) -> &[String] {
|
||||
pub fn partition_cols(&self) -> &AliasMapping {
|
||||
&self.partition_cols
|
||||
}
|
||||
}
|
||||
@@ -150,7 +151,7 @@ pub struct MergeScanExec {
|
||||
partition_metrics: Arc<Mutex<HashMap<usize, PartitionMetrics>>>,
|
||||
query_ctx: QueryContextRef,
|
||||
target_partition: usize,
|
||||
partition_cols: Vec<String>,
|
||||
partition_cols: AliasMapping,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MergeScanExec {
|
||||
@@ -175,7 +176,7 @@ impl MergeScanExec {
|
||||
region_query_handler: RegionQueryHandlerRef,
|
||||
query_ctx: QueryContextRef,
|
||||
target_partition: usize,
|
||||
partition_cols: Vec<String>,
|
||||
partition_cols: AliasMapping,
|
||||
) -> Result<Self> {
|
||||
// TODO(CookiePieWw): Initially we removed the metadata from the schema in #2000, but we have to
|
||||
// keep it for #4619 to identify json type in src/datatypes/src/schema/column_schema.rs.
|
||||
@@ -215,12 +216,18 @@ impl MergeScanExec {
|
||||
let partition_exprs = partition_cols
|
||||
.iter()
|
||||
.filter_map(|col| {
|
||||
session_state
|
||||
.create_physical_expr(
|
||||
Expr::Column(ColumnExpr::new_unqualified(col)),
|
||||
plan.schema(),
|
||||
)
|
||||
.ok()
|
||||
if let Some(first_alias) = col.1.first() {
|
||||
session_state
|
||||
.create_physical_expr(
|
||||
Expr::Column(ColumnExpr::new_unqualified(
|
||||
first_alias.name().to_string(),
|
||||
)),
|
||||
plan.schema(),
|
||||
)
|
||||
.ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let partitioning = Partitioning::Hash(partition_exprs, target_partition);
|
||||
@@ -420,17 +427,22 @@ impl MergeScanExec {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut hash_cols = HashSet::default();
|
||||
let all_partition_col_aliases: HashSet<_> = self
|
||||
.partition_cols
|
||||
.values()
|
||||
.flat_map(|aliases| aliases.iter().map(|c| c.name()))
|
||||
.collect();
|
||||
let mut overlaps = vec![];
|
||||
for expr in &hash_exprs {
|
||||
if let Some(col_expr) = expr.as_any().downcast_ref::<Column>() {
|
||||
hash_cols.insert(col_expr.name());
|
||||
if let Some(col_expr) = expr.as_any().downcast_ref::<Column>()
|
||||
&& all_partition_col_aliases.contains(col_expr.name())
|
||||
{
|
||||
overlaps.push(expr.clone());
|
||||
}
|
||||
}
|
||||
for col in &self.partition_cols {
|
||||
if !hash_cols.contains(col.as_str()) {
|
||||
// The partitioning columns are not the same
|
||||
return None;
|
||||
}
|
||||
|
||||
if overlaps.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
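// Editor's example (hypothetical column names): if the table is partitioned on `host`,
// aliased in this plan as [host, h], then
//   requested hash columns [h, ts] -> overlaps = [h]  => expose Partitioning::Hash([h], n)
//   requested hash columns [ts]    -> overlaps = []   => return None, keep the old partitioning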
Some(Self {
|
||||
@@ -443,7 +455,7 @@ impl MergeScanExec {
|
||||
metric: self.metric.clone(),
|
||||
properties: PlanProperties::new(
|
||||
self.properties.eq_properties.clone(),
|
||||
Partitioning::Hash(hash_exprs, self.target_partition),
|
||||
Partitioning::Hash(overlaps, self.target_partition),
|
||||
self.properties.emission_type,
|
||||
self.properties.boundedness,
|
||||
),
|
||||
|
||||
@@ -177,7 +177,7 @@ impl ExtensionPlanner for DistExtensionPlanner {
|
||||
self.region_query_handler.clone(),
|
||||
query_ctx,
|
||||
session_state.config().target_partitions(),
|
||||
merge_scan.partition_cols().to_vec(),
|
||||
merge_scan.partition_cols().clone(),
|
||||
)?;
|
||||
Ok(Some(Arc::new(merge_scan_plan) as _))
|
||||
}
|
||||
|
||||
@@ -88,6 +88,10 @@ impl CountWildcardToTimeIndexRule {
|
||||
// check if the time index is a valid column as for current plan
|
||||
if let Some(col) = &col {
|
||||
let mut is_valid = false;
|
||||
// if more than one input, we give up and just use `count(1)`
|
||||
if plan.inputs().len() > 1 {
|
||||
return None;
|
||||
}
|
||||
for input in plan.inputs() {
|
||||
if input.schema().has_column(col) {
|
||||
is_valid = true;
|
||||
@@ -171,6 +175,11 @@ impl TreeNodeVisitor<'_> for TimeIndexFinder {
|
||||
}
|
||||
}
|
||||
|
||||
if node.inputs().len() > 1 {
|
||||
// if more than one input, we give up and just use `count(1)`
|
||||
return Ok(TreeNodeRecursion::Stop);
|
||||
}
|
||||
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
}
|
||||
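// Editor's note (illustrative, not part of this diff) on the two early-return guards
// added above:
//   single-input plan: count(*) can become count(<time index>) because the time index
//     column is not nullable, so both expressions count the same rows;
//   more than one input (e.g. a join): it is ambiguous which side's time index to use,
//     so the rule backs off and the plain count(1) form is kept.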
|
||||
|
||||
@@ -17,7 +17,6 @@ use std::sync::Arc;
|
||||
use datafusion::config::ConfigOptions;
|
||||
use datafusion::physical_optimizer::PhysicalOptimizerRule;
|
||||
use datafusion::physical_plan::ExecutionPlan;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNode};
|
||||
use datafusion_common::Result as DfResult;
|
||||
use datafusion_physical_expr::Distribution;
|
||||
|
||||
@@ -56,26 +55,52 @@ impl PassDistribution {
|
||||
plan: Arc<dyn ExecutionPlan>,
|
||||
_config: &ConfigOptions,
|
||||
) -> DfResult<Arc<dyn ExecutionPlan>> {
|
||||
let mut distribution_requirement = None;
|
||||
let result = plan.transform_down(|plan| {
|
||||
if let Some(distribution) = plan.required_input_distribution().first()
|
||||
&& !matches!(distribution, Distribution::UnspecifiedDistribution)
|
||||
// incorrect workaround, doesn't fix the actual issue
|
||||
&& plan.name() != "HashJoinExec"
|
||||
{
|
||||
distribution_requirement = Some(distribution.clone());
|
||||
}
|
||||
// Start from root with no requirement
|
||||
Self::rewrite_with_distribution(plan, None)
|
||||
}
|
||||
|
||||
if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>()
|
||||
&& let Some(distribution) = distribution_requirement.as_ref()
|
||||
&& let Some(new_plan) = merge_scan.try_with_new_distribution(distribution.clone())
|
||||
{
|
||||
Ok(Transformed::yes(Arc::new(new_plan) as _))
|
||||
} else {
|
||||
Ok(Transformed::no(plan))
|
||||
}
|
||||
})?;
|
||||
/// Top-down rewrite that propagates distribution requirements to children.
|
||||
fn rewrite_with_distribution(
|
||||
plan: Arc<dyn ExecutionPlan>,
|
||||
current_req: Option<Distribution>,
|
||||
) -> DfResult<Arc<dyn ExecutionPlan>> {
|
||||
// If this is a MergeScanExec, try to apply the current requirement.
|
||||
if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>()
|
||||
&& let Some(distribution) = current_req.as_ref()
|
||||
&& let Some(new_plan) = merge_scan.try_with_new_distribution(distribution.clone())
|
||||
{
|
||||
// Leaf node; no children to process
|
||||
return Ok(Arc::new(new_plan) as _);
|
||||
}
|
||||
|
||||
Ok(result.data)
|
||||
// Compute per-child requirements from the current node.
|
||||
let children = plan.children();
|
||||
if children.is_empty() {
|
||||
return Ok(plan);
|
||||
}
|
||||
|
||||
let required = plan.required_input_distribution();
|
||||
let mut new_children = Vec::with_capacity(children.len());
|
||||
for (idx, child) in children.into_iter().enumerate() {
|
||||
let child_req = match required.get(idx) {
|
||||
Some(Distribution::UnspecifiedDistribution) => None,
|
||||
None => current_req.clone(),
|
||||
Some(req) => Some(req.clone()),
|
||||
};
|
||||
let new_child = Self::rewrite_with_distribution(child.clone(), child_req)?;
|
||||
new_children.push(new_child);
|
||||
}
|
||||
|
||||
// Rebuild the node only if any child changed (pointer inequality)
|
||||
let unchanged = plan
|
||||
.children()
|
||||
.into_iter()
|
||||
.zip(new_children.iter())
|
||||
.all(|(old, new)| Arc::ptr_eq(old, new));
|
||||
if unchanged {
|
||||
Ok(plan)
|
||||
} else {
|
||||
plan.with_new_children(new_children)
|
||||
}
|
||||
}
|
||||
}
|
||||
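// Editor's sketch of how a requirement flows through `rewrite_with_distribution`
// (operator names are hypothetical):
//
//   OpA  requires [SinglePartition] for its child
//     OpB  requires [UnspecifiedDistribution]      -> requirement cleared here
//       MergeScanExec                              -> nothing applied
//
//   OpA  requires [SinglePartition] for its child
//     MergeScanExec                                -> try_with_new_distribution(SinglePartition)
//
// A child index missing from `required_input_distribution()` inherits the current
// requirement unchanged.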
|
||||
@@ -16,6 +16,7 @@ use std::collections::HashSet;
|
||||
|
||||
use api::v1::SemanticType;
|
||||
use arrow_schema::SortOptions;
|
||||
use common_function::aggrs::aggr_wrapper::aggr_state_func_name;
|
||||
use common_recordbatch::OrderOption;
|
||||
use datafusion::datasource::DefaultTableSource;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor};
|
||||
@@ -217,7 +218,8 @@ impl TreeNodeVisitor<'_> for ScanHintVisitor {
|
||||
is_all_last_value = false;
|
||||
break;
|
||||
};
|
||||
if func.func.name() != "last_value"
|
||||
if (func.func.name() != "last_value"
|
||||
&& func.func.name() != aggr_state_func_name("last_value"))
|
||||
|| func.params.filter.is_some()
|
||||
|| func.params.distinct
|
||||
{
|
||||
|
||||
@@ -282,6 +282,16 @@ impl DfLogicalPlanner {
|
||||
.build()
|
||||
.context(PlanSqlSnafu)?;
|
||||
}
|
||||
|
||||
// Wrap in SubqueryAlias to ensure proper table qualification for CTE
|
||||
logical_plan = LogicalPlan::SubqueryAlias(
|
||||
datafusion_expr::SubqueryAlias::try_new(
|
||||
Arc::new(logical_plan),
|
||||
cte.name.value.clone(),
|
||||
)
|
||||
.context(PlanSqlSnafu)?,
|
||||
);
|
||||
|
||||
planner_context.insert_cte(&cte.name.value, logical_plan);
|
||||
}
|
||||
CteContent::Sql(_) => {
|
||||
|
||||
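// Editor's note on the SubqueryAlias wrapping above (query shape is hypothetical):
//   WITH recent AS (<non-SQL CTE body>) SELECT recent.val FROM recent;
// Registering the CTE plan wrapped as `SubqueryAlias: recent` lets the outer query
// resolve `recent.val`; without the wrapper the inner plan keeps its original table
// qualifiers and references through the CTE name may fail to resolve.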
@@ -1222,12 +1222,13 @@ impl PromPlanner {
|
||||
let mut exprs = Vec::with_capacity(labels.labels.len());
|
||||
for label in &labels.labels {
|
||||
// nonexistent labels will be ignored
|
||||
if let Ok(field) = input_schema.field_with_unqualified_name(label) {
|
||||
exprs.push(DfExpr::Column(Column::from(field.name())));
|
||||
if let Some(column_name) = Self::find_case_sensitive_column(input_schema, label)
|
||||
{
|
||||
exprs.push(DfExpr::Column(Column::from_name(column_name.clone())));
|
||||
|
||||
if update_ctx {
|
||||
// update the tag columns in context
|
||||
self.ctx.tag_columns.push(label.clone());
|
||||
self.ctx.tag_columns.push(column_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1290,13 +1291,12 @@ impl PromPlanner {
|
||||
continue;
|
||||
}
|
||||
|
||||
let col = if table_schema
|
||||
.field_with_unqualified_name(&matcher.name)
|
||||
.is_err()
|
||||
{
|
||||
DfExpr::Literal(ScalarValue::Utf8(Some(String::new())), None).alias(matcher.name)
|
||||
let column_name = Self::find_case_sensitive_column(table_schema, matcher.name.as_str());
|
||||
let col = if let Some(column_name) = column_name {
|
||||
DfExpr::Column(Column::from_name(column_name))
|
||||
} else {
|
||||
DfExpr::Column(Column::from_name(matcher.name))
|
||||
DfExpr::Literal(ScalarValue::Utf8(Some(String::new())), None)
|
||||
.alias(matcher.name.clone())
|
||||
};
|
||||
let lit = DfExpr::Literal(ScalarValue::Utf8(Some(matcher.value)), None);
|
||||
let expr = match matcher.op {
|
||||
@@ -1353,6 +1353,14 @@ impl PromPlanner {
|
||||
Ok(exprs)
|
||||
}
|
||||
|
||||
fn find_case_sensitive_column(schema: &DFSchemaRef, column: &str) -> Option<String> {
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.find(|field| field.name() == column)
|
||||
.map(|field| field.name().clone())
|
||||
}
|
||||
|
||||
fn table_ref(&self) -> Result<TableReference> {
|
||||
let table_name = self
|
||||
.ctx
|
||||
|
||||
@@ -19,6 +19,7 @@ use std::sync::{Arc, RwLock};
|
||||
use async_trait::async_trait;
|
||||
use catalog::CatalogManagerRef;
|
||||
use common_base::Plugins;
|
||||
use common_function::aggrs::aggr_wrapper::fix_order::FixStateUdafOrderingAnalyzer;
|
||||
use common_function::function_factory::ScalarFunctionFactory;
|
||||
use common_function::handlers::{
|
||||
FlowServiceHandlerRef, ProcedureServiceHandlerRef, TableMutationHandlerRef,
|
||||
@@ -136,6 +137,8 @@ impl QueryEngineState {
|
||||
analyzer.rules.push(Arc::new(DistPlannerAnalyzer));
|
||||
}
|
||||
|
||||
analyzer.rules.push(Arc::new(FixStateUdafOrderingAnalyzer));
|
||||
|
||||
let mut optimizer = Optimizer::new();
|
||||
optimizer.rules.push(Arc::new(ScanHintRule));
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
v0.11.3
|
||||
v0.11.6
|
||||
|
||||
@@ -71,8 +71,8 @@ impl JemallocCollector {
|
||||
let _ = self.epoch.advance().context(UpdateJemallocMetricsSnafu)?;
|
||||
let allocated = self.allocated.read().context(UpdateJemallocMetricsSnafu)?;
|
||||
let resident = self.resident.read().context(UpdateJemallocMetricsSnafu)?;
|
||||
SYS_JEMALLOC_RESIDEN.set(allocated as i64);
|
||||
SYS_JEMALLOC_ALLOCATED.set(resident as i64);
|
||||
SYS_JEMALLOC_ALLOCATED.set(allocated as i64);
|
||||
SYS_JEMALLOC_RESIDEN.set(resident as i64);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ use ahash::{HashMap, HashSet};
|
||||
use api::v1::{RowInsertRequests, Value};
|
||||
use common_grpc::precision::Precision;
|
||||
use common_query::prelude::{GREPTIME_COUNT, GREPTIME_TIMESTAMP, GREPTIME_VALUE};
|
||||
use itertools::Itertools;
|
||||
use lazy_static::lazy_static;
|
||||
use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest;
|
||||
use otel_arrow_rust::proto::opentelemetry::common::v1::{any_value, AnyValue, KeyValue};
|
||||
@@ -251,10 +250,20 @@ fn process_scope_attrs(scope: &ScopeMetrics, metric_ctx: &OtlpMetricCtx) -> Opti
|
||||
|
||||
// See https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/145942706622aba5c276ca47f48df438228bfea4/pkg/translator/prometheus/normalize_name.go#L55
|
||||
pub fn normalize_metric_name(metric: &Metric, metric_type: &MetricType) -> String {
|
||||
let mut name_tokens = NON_ALPHA_NUM_CHAR
|
||||
// Split the metric name into "tokens" on non-alphanumeric characters, filtering out empty strings
|
||||
let mut name_tokens: Vec<String> = NON_ALPHA_NUM_CHAR
|
||||
.split(&metric.name)
|
||||
.map(|s| s.to_string())
|
||||
.collect_vec();
|
||||
.filter_map(|s| {
|
||||
let trimmed = s.trim();
|
||||
if trimmed.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(trimmed.to_string())
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Append unit if it exists
|
||||
if !metric.unit.is_empty() {
|
||||
let (main, per) = build_unit_suffix(&metric.unit);
|
||||
if let Some(main) = main
|
||||
@@ -270,17 +279,24 @@ pub fn normalize_metric_name(metric: &Metric, metric_type: &MetricType) -> Strin
|
||||
}
|
||||
}
|
||||
|
||||
// Append _total for Counters (monotonic sums)
|
||||
if matches!(metric_type, MetricType::MonotonicSum) {
|
||||
// Remove existing "total" tokens first, then append
|
||||
name_tokens.retain(|t| t != TOTAL);
|
||||
name_tokens.push(TOTAL.to_string());
|
||||
}
|
||||
|
||||
// Append _ratio for metrics with unit "1" (gauges only)
|
||||
if metric.unit == "1" && matches!(metric_type, MetricType::Gauge) {
|
||||
// Remove existing "ratio" tokens first, then append
|
||||
name_tokens.retain(|t| t != RATIO);
|
||||
name_tokens.push(RATIO.to_string());
|
||||
}
|
||||
|
||||
// Build the string from the tokens, separated with underscores
|
||||
let name = name_tokens.join(UNDERSCORE);
|
||||
|
||||
// Metric name cannot start with a digit, so prefix it with "_" in this case
|
||||
if let Some((_, first)) = name.char_indices().next()
|
||||
&& first >= '0'
|
||||
&& first <= '9'
|
||||
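// Editor's worked example of the normalization steps above, matching the test cases
// further down in this file:
//   name = "system.disk.io", unit = "By", type = MonotonicSum
//     tokens        -> ["system", "disk", "io"]
//     + unit suffix -> ["system", "disk", "io", "bytes"]
//     + "_total"    -> ["system", "disk", "io", "bytes", "total"]
//     joined        -> "system_disk_io_bytes_total"
//   name = "2xx_requests", unit = "", type = Init
//     tokens -> ["2xx", "requests"], joined -> "2xx_requests",
//     leading digit -> "_2xx_requests"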
@@ -298,7 +314,8 @@ fn build_unit_suffix(unit: &str) -> (Option<String>, Option<String>) {
|
||||
|
||||
fn check_unit(unit_str: &str, unit_map: &HashMap<String, String>) -> Option<String> {
|
||||
let u = unit_str.trim();
|
||||
if !u.is_empty() && !u.contains("{}") {
|
||||
// Skip units that are empty or contain "{" or "}" characters
|
||||
if !u.is_empty() && !u.contains('{') && !u.contains('}') {
|
||||
let u = unit_map.get(u).map(|s| s.as_ref()).unwrap_or(u);
|
||||
let u = clean_unit_name(u);
|
||||
if !u.is_empty() {
|
||||
@@ -309,7 +326,13 @@ fn check_unit(unit_str: &str, unit_map: &HashMap<String, String>) -> Option<Stri
|
||||
}
|
||||
|
||||
fn clean_unit_name(name: &str) -> String {
|
||||
NON_ALPHA_NUM_CHAR.split(name).join(UNDERSCORE)
|
||||
// Split on non-alphanumeric characters, filter out empty strings, then join with underscores
|
||||
// This matches the Go implementation: strings.FieldsFunc + strings.Join
|
||||
NON_ALPHA_NUM_CHAR
|
||||
.split(name)
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect::<Vec<&str>>()
|
||||
.join(UNDERSCORE)
|
||||
}
|
||||
|
||||
// See https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/145942706622aba5c276ca47f48df438228bfea4/pkg/translator/prometheus/normalize_label.go#L27
|
||||
@@ -1037,6 +1060,57 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normalize_metric_name_edge_cases() {
|
||||
let test_cases = vec![
|
||||
// Edge case: name with multiple non-alphanumeric chars in a row
|
||||
(
|
||||
Metric {
|
||||
name: "foo--bar__baz".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Init,
|
||||
"foo_bar_baz",
|
||||
),
|
||||
// Edge case: name starting and ending with non-alphanumeric
|
||||
(
|
||||
Metric {
|
||||
name: "-foo_bar-".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Init,
|
||||
"foo_bar",
|
||||
),
|
||||
// Edge case: name with only special chars (should be empty)
|
||||
(
|
||||
Metric {
|
||||
name: "--___--".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Init,
|
||||
"",
|
||||
),
|
||||
// Edge case: name starting with digit
|
||||
(
|
||||
Metric {
|
||||
name: "2xx_requests".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Init,
|
||||
"_2xx_requests",
|
||||
),
|
||||
];
|
||||
|
||||
for (metric, metric_type, expected) in test_cases {
|
||||
let result = normalize_metric_name(&metric, &metric_type);
|
||||
assert_eq!(
|
||||
result, expected,
|
||||
"Failed for metric name: '{}', unit: '{}', type: {:?}",
|
||||
metric.name, metric.unit, metric_type
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normalize_label_name() {
|
||||
let test_cases = vec![
|
||||
@@ -1058,6 +1132,320 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_clean_unit_name() {
|
||||
// Test the improved clean_unit_name function
|
||||
assert_eq!(clean_unit_name("faults"), "faults");
|
||||
assert_eq!(clean_unit_name("{faults}"), "faults"); // clean_unit_name still processes braces internally
|
||||
assert_eq!(clean_unit_name("req/sec"), "req_sec");
|
||||
assert_eq!(clean_unit_name("m/s"), "m_s");
|
||||
assert_eq!(clean_unit_name("___test___"), "test");
|
||||
assert_eq!(
|
||||
clean_unit_name("multiple__underscores"),
|
||||
"multiple_underscores"
|
||||
);
|
||||
assert_eq!(clean_unit_name(""), "");
|
||||
assert_eq!(clean_unit_name("___"), "");
|
||||
assert_eq!(clean_unit_name("bytes.per.second"), "bytes_per_second");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normalize_metric_name_braced_units() {
|
||||
// Test that units with braces are rejected (not processed)
|
||||
let test_cases = vec![
|
||||
(
|
||||
Metric {
|
||||
name: "test.metric".to_string(),
|
||||
unit: "{faults}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"test_metric_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "test.metric".to_string(),
|
||||
unit: "{operations}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"test_metric", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "test.metric".to_string(),
|
||||
unit: "{}".to_string(), // empty braces should be ignored due to contains('{') || contains('}')
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"test_metric",
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "test.metric".to_string(),
|
||||
unit: "faults".to_string(), // no braces, should work normally
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"test_metric_faults",
|
||||
),
|
||||
];
|
||||
|
||||
for (metric, metric_type, expected) in test_cases {
|
||||
let result = normalize_metric_name(&metric, &metric_type);
|
||||
assert_eq!(
|
||||
result, expected,
|
||||
"Failed for metric name: '{}', unit: '{}', type: {:?}. Got: '{}', Expected: '{}'",
|
||||
metric.name, metric.unit, metric_type, result, expected
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normalize_metric_name_with_testdata() {
|
||||
// Test cases extracted from real OTLP metrics data from testdata.txt
|
||||
let test_cases = vec![
|
||||
// Basic system metrics with various units
|
||||
(
|
||||
Metric {
|
||||
name: "system.paging.faults".to_string(),
|
||||
unit: "{faults}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_paging_faults_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.paging.operations".to_string(),
|
||||
unit: "{operations}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_paging_operations_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.paging.usage".to_string(),
|
||||
unit: "By".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::NonMonotonicSum,
|
||||
"system_paging_usage_bytes",
|
||||
),
|
||||
// Load average metrics - gauge with custom unit
|
||||
(
|
||||
Metric {
|
||||
name: "system.cpu.load_average.15m".to_string(),
|
||||
unit: "{thread}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"system_cpu_load_average_15m", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.cpu.load_average.1m".to_string(),
|
||||
unit: "{thread}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"system_cpu_load_average_1m", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
// Disk I/O with bytes unit
|
||||
(
|
||||
Metric {
|
||||
name: "system.disk.io".to_string(),
|
||||
unit: "By".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_disk_io_bytes_total",
|
||||
),
|
||||
// Time-based metrics with seconds unit
|
||||
(
|
||||
Metric {
|
||||
name: "system.disk.io_time".to_string(),
|
||||
unit: "s".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_disk_io_time_seconds_total",
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.disk.operation_time".to_string(),
|
||||
unit: "s".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_disk_operation_time_seconds_total",
|
||||
),
|
||||
// CPU time metric
|
||||
(
|
||||
Metric {
|
||||
name: "system.cpu.time".to_string(),
|
||||
unit: "s".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_cpu_time_seconds_total",
|
||||
),
|
||||
// Process counts
|
||||
(
|
||||
Metric {
|
||||
name: "system.processes.count".to_string(),
|
||||
unit: "{processes}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::NonMonotonicSum,
|
||||
"system_processes_count", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.processes.created".to_string(),
|
||||
unit: "{processes}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_processes_created_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
// Memory usage with bytes
|
||||
(
|
||||
Metric {
|
||||
name: "system.memory.usage".to_string(),
|
||||
unit: "By".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::NonMonotonicSum,
|
||||
"system_memory_usage_bytes",
|
||||
),
|
||||
// Uptime as gauge
|
||||
(
|
||||
Metric {
|
||||
name: "system.uptime".to_string(),
|
||||
unit: "s".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"system_uptime_seconds",
|
||||
),
|
||||
// Network metrics
|
||||
(
|
||||
Metric {
|
||||
name: "system.network.connections".to_string(),
|
||||
unit: "{connections}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::NonMonotonicSum,
|
||||
"system_network_connections", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.network.dropped".to_string(),
|
||||
unit: "{packets}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_network_dropped_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.network.errors".to_string(),
|
||||
unit: "{errors}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_network_errors_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.network.io".to_string(),
|
||||
unit: "By".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_network_io_bytes_total",
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.network.packets".to_string(),
|
||||
unit: "{packets}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"system_network_packets_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
// Filesystem metrics
|
||||
(
|
||||
Metric {
|
||||
name: "system.filesystem.inodes.usage".to_string(),
|
||||
unit: "{inodes}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::NonMonotonicSum,
|
||||
"system_filesystem_inodes_usage", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "system.filesystem.usage".to_string(),
|
||||
unit: "By".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::NonMonotonicSum,
|
||||
"system_filesystem_usage_bytes",
|
||||
),
|
||||
// Edge cases with special characters and numbers
|
||||
(
|
||||
Metric {
|
||||
name: "system.load.1".to_string(),
|
||||
unit: "1".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"system_load_1_ratio",
|
||||
),
|
||||
(
|
||||
Metric {
|
||||
name: "http.request.2xx".to_string(),
|
||||
unit: "{requests}".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::MonotonicSum,
|
||||
"http_request_2xx_total", // braced units are rejected, no unit suffix added
|
||||
),
|
||||
// Metric with dots and underscores mixed
|
||||
(
|
||||
Metric {
|
||||
name: "jvm.memory.heap_usage".to_string(),
|
||||
unit: "By".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"jvm_memory_heap_usage_bytes",
|
||||
),
|
||||
// Complex unit with per-second
|
||||
(
|
||||
Metric {
|
||||
name: "http.request.rate".to_string(),
|
||||
unit: "1/s".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
MetricType::Gauge,
|
||||
"http_request_rate_per_second",
|
||||
),
|
||||
];
|
||||
|
||||
for (metric, metric_type, expected) in test_cases {
|
||||
let result = normalize_metric_name(&metric, &metric_type);
|
||||
assert_eq!(
|
||||
result, expected,
|
||||
"Failed for metric name: '{}', unit: '{}', type: {:?}. Got: '{}', Expected: '{}'",
|
||||
metric.name, metric.unit, metric_type, result, expected
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn keyvalue(key: &str, value: &str) -> KeyValue {
|
||||
KeyValue {
|
||||
key: key.into(),
|
||||
|
||||
@@ -292,11 +292,18 @@ impl<'a> ParserContext<'a> {
|
||||
|
||||
let output_table_name = self.intern_parse_table_name()?;
|
||||
|
||||
let expire_after = if self
|
||||
.parser
|
||||
.consume_tokens(&[Token::make_keyword(EXPIRE), Token::make_keyword(AFTER)])
|
||||
let expire_after = if let Token::Word(w1) = &self.parser.peek_token().token
|
||||
&& w1.value.eq_ignore_ascii_case(EXPIRE)
|
||||
{
|
||||
Some(self.parse_interval_no_month("EXPIRE AFTER")?)
|
||||
self.parser.next_token();
|
||||
if let Token::Word(w2) = &self.parser.peek_token().token
|
||||
&& w2.value.eq_ignore_ascii_case(AFTER)
|
||||
{
|
||||
self.parser.next_token();
|
||||
Some(self.parse_interval_no_month("EXPIRE AFTER")?)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
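// Editor's note: the keyword peeking above keeps the original uppercase form working
// and additionally accepts the lowercase spellings exercised by the tests added below:
//   expire after '10 minutes'
//   expire after interval '2 hours'
// `consume_tokens(&[Token::make_keyword(EXPIRE), ...])` compared whole `Word` tokens,
// which apparently did not match a lowercase `expire`; peeking at `Token::Word` and
// comparing with `eq_ignore_ascii_case` does.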
@@ -1500,6 +1507,45 @@ SELECT max(c1), min(c2) FROM schema_2.table_2;",
|
||||
comment: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
r"
|
||||
create flow `task_3`
|
||||
sink to schema_1.table_1
|
||||
expire after '10 minutes'
|
||||
as
|
||||
select max(c1), min(c2) from schema_2.table_2;",
|
||||
CreateFlowWoutQuery {
|
||||
flow_name: ObjectName::from(vec![Ident::with_quote('`', "task_3")]),
|
||||
sink_table_name: ObjectName::from(vec![
|
||||
Ident::new("schema_1"),
|
||||
Ident::new("table_1"),
|
||||
]),
|
||||
or_replace: false,
|
||||
if_not_exists: false,
|
||||
expire_after: Some(600), // 10 minutes in seconds
|
||||
comment: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
r"
|
||||
create or replace flow if not exists task_4
|
||||
sink to schema_1.table_1
|
||||
expire after interval '2 hours'
|
||||
comment 'lowercase test'
|
||||
as
|
||||
select max(c1), min(c2) from schema_2.table_2;",
|
||||
CreateFlowWoutQuery {
|
||||
flow_name: ObjectName::from(vec![Ident::new("task_4")]),
|
||||
sink_table_name: ObjectName::from(vec![
|
||||
Ident::new("schema_1"),
|
||||
Ident::new("table_1"),
|
||||
]),
|
||||
or_replace: true,
|
||||
if_not_exists: true,
|
||||
expire_after: Some(7200), // 2 hours in seconds
|
||||
comment: Some("lowercase test".to_string()),
|
||||
},
|
||||
),
|
||||
];
|
||||
|
||||
for (sql, expected) in testcases {
|
||||
|
||||
@@ -34,7 +34,6 @@ const FORMAT: &str = "FORMAT";
|
||||
|
||||
use sqlparser::parser::Parser;
|
||||
|
||||
use crate::dialect::GreptimeDbDialect;
|
||||
use crate::parsers::error::{
|
||||
ConvertToLogicalExpressionSnafu, EvaluationSnafu, ParserSnafu, TQLError,
|
||||
};
|
||||
@@ -106,36 +105,49 @@ impl ParserContext<'_> {
|
||||
let (start, end, step, lookback) = match parser.peek_token().token {
|
||||
Token::LParen => {
|
||||
let _consume_lparen_token = parser.next_token();
|
||||
let start = Self::parse_string_or_number_or_word(
|
||||
parser,
|
||||
&[Token::Comma],
|
||||
require_now_expr,
|
||||
)?
|
||||
.0;
|
||||
let end = Self::parse_string_or_number_or_word(
|
||||
parser,
|
||||
&[Token::Comma],
|
||||
require_now_expr,
|
||||
)?
|
||||
.0;
|
||||
let exprs = parser
|
||||
.parse_comma_separated(Parser::parse_expr)
|
||||
.context(ParserSnafu)?;
|
||||
|
||||
let (step, delimiter) = Self::parse_string_or_number_or_word(
|
||||
parser,
|
||||
&[Token::Comma, Token::RParen],
|
||||
false,
|
||||
let param_count = exprs.len();
|
||||
|
||||
if param_count != 3 && param_count != 4 {
|
||||
return Err(ParserError::ParserError(
|
||||
format!("Expected 3 or 4 expressions in TQL parameters (start, end, step, [lookback]), but found {}", param_count)
|
||||
))
|
||||
.context(ParserSnafu);
|
||||
}
|
||||
|
||||
let mut exprs_iter = exprs.into_iter();
|
||||
// Safety: safe to call next and unwrap, because we already checked param_count above.
|
||||
let start = Self::parse_expr_to_literal_or_ts(
|
||||
exprs_iter.next().unwrap(),
|
||||
require_now_expr,
|
||||
)?;
|
||||
let lookback = if delimiter == Token::Comma {
|
||||
Self::parse_string_or_number_or_word(parser, &[Token::RParen], false)
|
||||
.ok()
|
||||
.map(|t| t.0)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let end = Self::parse_expr_to_literal_or_ts(
|
||||
exprs_iter.next().unwrap(),
|
||||
require_now_expr,
|
||||
)?;
|
||||
let step = Self::parse_expr_to_literal_or_ts(exprs_iter.next().unwrap(), false)?;
|
||||
|
||||
let lookback = exprs_iter
|
||||
.next()
|
||||
.map(|expr| Self::parse_expr_to_literal_or_ts(expr, false))
|
||||
.transpose()?;
|
||||
|
||||
if !parser.consume_token(&Token::RParen) {
|
||||
return Err(ParserError::ParserError(format!(
|
||||
"Expected ')' after TQL parameters, but found: {}",
|
||||
parser.peek_token()
|
||||
)))
|
||||
.context(ParserSnafu);
|
||||
}
|
||||
|
||||
(start, end, step, lookback)
|
||||
}
|
||||
_ => ("0".to_string(), "0".to_string(), "5m".to_string(), None),
|
||||
};
|
||||
|
||||
let query = Self::parse_tql_query(parser, self.sql).context(ParserSnafu)?;
|
||||
Ok(TqlParameters::new(start, end, step, lookback, query))
|
||||
}
|
||||
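// Editor's summary of the parameter shapes accepted by the expression-based parsing
// above (the first line comes from the tests below; the literal values in the other
// lines are illustrative):
//   TQL EVAL (date_trunc('day', now() - interval '1' day), date_trunc('day', now()), '1h') <query>
//   TQL EVAL (0, 30, '10s') <query>          -- start, end, step
//   TQL EVAL (0, 30, '10s', '2m') <query>    -- optional fourth expression = lookback
// Any other number of comma-separated expressions, or a missing closing ')', produces
// a parser error.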
@@ -179,72 +191,43 @@ impl ParserContext<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to parse and consume a string, number or word token.
|
||||
/// Return `Ok` if it's parsed and one of the given delimiter tokens is consumed.
|
||||
/// The string and matched delimiter will be returned as a tuple.
|
||||
fn parse_string_or_number_or_word(
|
||||
parser: &mut Parser,
|
||||
delimiter_tokens: &[Token],
|
||||
require_now_expr: bool,
|
||||
) -> std::result::Result<(String, Token), TQLError> {
|
||||
let mut tokens = vec![];
|
||||
|
||||
while !delimiter_tokens.contains(&parser.peek_token().token) {
|
||||
let token = parser.next_token().token;
|
||||
if matches!(token, Token::EOF) {
|
||||
break;
|
||||
}
|
||||
tokens.push(token);
|
||||
}
|
||||
let result = match tokens.len() {
|
||||
0 => Err(ParserError::ParserError(
|
||||
"Expected at least one token".to_string(),
|
||||
))
|
||||
.context(ParserSnafu),
|
||||
1 => {
|
||||
let value = match tokens[0].clone() {
|
||||
Token::Number(n, _) if !require_now_expr => n,
|
||||
Token::DoubleQuotedString(s) | Token::SingleQuotedString(s)
|
||||
if !require_now_expr =>
|
||||
{
|
||||
s
|
||||
}
|
||||
Token::Word(_) => Self::parse_tokens_to_ts(tokens, require_now_expr)?,
|
||||
unexpected => {
|
||||
if !require_now_expr {
|
||||
return Err(ParserError::ParserError(format!(
|
||||
"Expected number, string or word, but have {unexpected:?}"
|
||||
)))
|
||||
.context(ParserSnafu);
|
||||
} else {
|
||||
return Err(ParserError::ParserError(format!(
|
||||
"Expected expression containing `now()`, but have {unexpected:?}"
|
||||
)))
|
||||
.context(ParserSnafu);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(value)
|
||||
}
|
||||
_ => Self::parse_tokens_to_ts(tokens, require_now_expr),
|
||||
};
|
||||
for token in delimiter_tokens {
|
||||
if parser.consume_token(token) {
|
||||
return result.map(|v| (v, token.clone()));
|
||||
}
|
||||
}
|
||||
Err(ParserError::ParserError(format!(
|
||||
"Delimiters not match {delimiter_tokens:?}"
|
||||
)))
|
||||
.context(ParserSnafu)
|
||||
}
|
||||
|
||||
/// Parse the tokens to seconds and convert to string.
|
||||
fn parse_tokens_to_ts(
|
||||
tokens: Vec<Token>,
|
||||
/// Parse the expression to a literal string or a timestamp in seconds.
|
||||
fn parse_expr_to_literal_or_ts(
|
||||
parser_expr: sqlparser::ast::Expr,
|
||||
require_now_expr: bool,
|
||||
) -> std::result::Result<String, TQLError> {
|
||||
match parser_expr {
|
||||
sqlparser::ast::Expr::Value(v) => match v.value {
|
||||
sqlparser::ast::Value::Number(s, _) if !require_now_expr => Ok(s),
|
||||
sqlparser::ast::Value::DoubleQuotedString(s)
|
||||
| sqlparser::ast::Value::SingleQuotedString(s)
|
||||
if !require_now_expr =>
|
||||
{
|
||||
Ok(s)
|
||||
}
|
||||
unexpected => {
|
||||
if !require_now_expr {
|
||||
Err(ParserError::ParserError(format!(
|
||||
"Expected number, string or word, but have {unexpected:?}"
|
||||
)))
|
||||
.context(ParserSnafu)
|
||||
} else {
|
||||
Err(ParserError::ParserError(format!(
|
||||
"Expected expression containing `now()`, but have {unexpected:?}"
|
||||
)))
|
||||
.context(ParserSnafu)
|
||||
}
|
||||
}
|
||||
},
|
||||
_ => Self::parse_expr_to_ts(parser_expr, require_now_expr),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the expression to a timestamp in seconds.
|
||||
fn parse_expr_to_ts(
|
||||
parser_expr: sqlparser::ast::Expr,
|
||||
require_now_expr: bool,
|
||||
) -> std::result::Result<String, TQLError> {
|
||||
let parser_expr = Self::parse_to_expr(tokens)?;
|
||||
let lit = utils::parser_expr_to_scalar_value_literal(parser_expr, require_now_expr)
|
||||
.map_err(Box::new)
|
||||
.context(ConvertToLogicalExpressionSnafu)?;
|
||||
@@ -267,13 +250,6 @@ impl ParserContext<'_> {
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_to_expr(tokens: Vec<Token>) -> std::result::Result<sqlparser::ast::Expr, TQLError> {
|
||||
Parser::new(&GreptimeDbDialect {})
|
||||
.with_tokens(tokens)
|
||||
.parse_expr()
|
||||
.context(ParserSnafu)
|
||||
}
|
||||
|
||||
fn parse_tql_query(parser: &mut Parser, sql: &str) -> std::result::Result<String, ParserError> {
|
||||
while matches!(parser.peek_token().token, Token::Comma) {
|
||||
let _skip_token = parser.next_token();
|
||||
@@ -405,6 +381,60 @@ mod tests {
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_tql_eval_with_date_trunc() {
|
||||
let sql = "TQL EVAL (date_trunc('day', now() - interval '1' day), date_trunc('day', now()), '1h') http_requests_total{environment=~'staging|testing|development',method!='GET'} @ 1609746000 offset 5m";
|
||||
let statement = parse_into_statement(sql);
|
||||
match statement {
|
||||
Statement::Tql(Tql::Eval(eval)) => {
|
||||
// date_trunc('day', now() - interval '1' day) should resolve to start of yesterday
|
||||
// date_trunc('day', now()) should resolve to start of today
|
||||
// The exact values depend on when the test runs, but we can verify the structure
|
||||
assert!(eval.start.parse::<i64>().is_ok());
|
||||
assert!(eval.end.parse::<i64>().is_ok());
|
||||
assert_eq!(eval.step, "1h");
|
||||
assert_eq!(eval.lookback, None);
|
||||
assert_eq!(
|
||||
eval.query,
|
||||
"http_requests_total{environment=~'staging|testing|development',method!='GET'} @ 1609746000 offset 5m"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// Test with 4 parameters including lookback
|
||||
let sql = "TQL EVAL (date_trunc('hour', now() - interval '6' hour), date_trunc('hour', now()), '30m', '5m') cpu_usage_total";
|
||||
let statement = parse_into_statement(sql);
|
||||
match statement {
|
||||
Statement::Tql(Tql::Eval(eval)) => {
|
||||
assert!(eval.start.parse::<i64>().is_ok());
|
||||
assert!(eval.end.parse::<i64>().is_ok());
|
||||
assert_eq!(eval.step, "30m");
|
||||
assert_eq!(eval.lookback, Some("5m".to_string()));
|
||||
assert_eq!(eval.query, "cpu_usage_total");
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_tql_analyze_with_date_trunc() {
|
||||
let sql = "TQL ANALYZE VERBOSE FORMAT JSON (date_trunc('week', now() - interval '2' week), date_trunc('week', now()), '4h', '1h') network_bytes_total";
|
||||
let statement = parse_into_statement(sql);
|
||||
match statement {
|
||||
Statement::Tql(Tql::Analyze(analyze)) => {
|
||||
assert!(analyze.start.parse::<i64>().is_ok());
|
||||
assert!(analyze.end.parse::<i64>().is_ok());
|
||||
assert_eq!(analyze.step, "4h");
|
||||
assert_eq!(analyze.lookback, Some("1h".to_string()));
|
||||
assert_eq!(analyze.query, "network_bytes_total");
|
||||
assert!(analyze.is_verbose);
|
||||
assert_eq!(analyze.format, Some(AnalyzeFormat::JSON));
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_tql_eval() {
|
||||
let sql = "TQL EVAL (1676887657, 1676887659, '1m') http_requests_total{environment=~'staging|testing|development',method!='GET'} @ 1609746000 offset 5m";
|
||||
@@ -901,17 +931,26 @@ mod tests {
|
||||
let sql = "TQL EVAL (1676887657, 1676887659, 1m) http_requests_total{environment=~'staging|testing|development',method!='GET'} @ 1609746000 offset 5m";
|
||||
let result =
|
||||
ParserContext::create_with_dialect(sql, dialect, parse_options.clone()).unwrap_err();
|
||||
assert!(result
|
||||
.output_msg()
|
||||
.contains("Failed to extract a timestamp value"));
|
||||
|
||||
assert!(
|
||||
result
|
||||
.output_msg()
|
||||
.contains("Expected ')' after TQL parameters, but found: m"),
|
||||
"{}",
|
||||
result.output_msg()
|
||||
);
|
||||
|
||||
// missing end
|
||||
let sql = "TQL EVAL (1676887657, '1m') http_requests_total{environment=~'staging|testing|development',method!='GET'} @ 1609746000 offset 5m";
|
||||
let result =
|
||||
ParserContext::create_with_dialect(sql, dialect, parse_options.clone()).unwrap_err();
|
||||
assert!(result
|
||||
.output_msg()
|
||||
.contains("Failed to extract a timestamp value"));
|
||||
assert!(
|
||||
result
|
||||
.output_msg()
|
||||
.contains("Expected 3 or 4 expressions in TQL parameters"),
|
||||
"{}",
|
||||
result.output_msg()
|
||||
);
|
||||
|
||||
// empty TQL query
|
||||
let sql = "TQL EVAL (0, 30, '10s')";
|
||||
@@ -923,6 +962,12 @@ mod tests {
|
||||
let sql = "tql eval (0, 0, '1s) t;;';";
|
||||
let result =
|
||||
ParserContext::create_with_dialect(sql, dialect, parse_options.clone()).unwrap_err();
|
||||
assert!(result.output_msg().contains("Delimiters not match"));
|
||||
assert!(
|
||||
result
|
||||
.output_msg()
|
||||
.contains("Expected ')' after TQL parameters, but found: ;"),
|
||||
"{}",
|
||||
result.output_msg()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@
use std::fmt::Display;
use std::sync::Arc;

use crate::logstore::LogStore;
use crate::storage::RegionId;

// The Provider of kafka log store
@@ -78,6 +79,18 @@ impl Display for Provider {
}

impl Provider {
/// Returns the initial flushed entry id of the provider.
/// This is used to initialize the flushed entry id of the region when creating the region from scratch.
///
/// Currently only used for remote WAL.
/// For local WAL, the initial flushed entry id is 0.
pub fn initial_flushed_entry_id<S: LogStore>(&self, wal: &S) -> u64 {
if matches!(self, Provider::Kafka(_)) {
return wal.latest_entry_id(self).unwrap_or(0);
}
0
}

pub fn raft_engine_provider(id: u64) -> Provider {
Provider::RaftEngine(RaftEngineProvider { id })
}
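
A standalone sketch of the rule the new `initial_flushed_entry_id` helper encodes: remote (Kafka) WAL starts a fresh region from the latest entry id, local (raft-engine) WAL starts from 0. `WalReader`, `WalKind`, and `FixedWal` below are hypothetical stand-ins for the crate's `LogStore` and `Provider` types, written only to show the branch, not the actual API.

/// Hypothetical stand-in for the crate's `LogStore` trait; only what the sketch needs.
trait WalReader {
    /// Latest entry id known for this provider's WAL, if any.
    fn latest_entry_id(&self) -> Option<u64>;
}

/// Hypothetical stand-in for `Provider`.
enum WalKind {
    /// Remote WAL (Kafka): a region created from scratch starts from the topic's latest entry id.
    RemoteKafka,
    /// Local WAL (raft-engine): the initial flushed entry id is always 0.
    LocalRaftEngine,
}

fn initial_flushed_entry_id<W: WalReader>(kind: &WalKind, wal: &W) -> u64 {
    match kind {
        WalKind::RemoteKafka => wal.latest_entry_id().unwrap_or(0),
        WalKind::LocalRaftEngine => 0,
    }
}

struct FixedWal(Option<u64>);

impl WalReader for FixedWal {
    fn latest_entry_id(&self) -> Option<u64> {
        self.0
    }
}

fn main() {
    assert_eq!(initial_flushed_entry_id(&WalKind::RemoteKafka, &FixedWal(Some(42))), 42);
    assert_eq!(initial_flushed_entry_id(&WalKind::LocalRaftEngine, &FixedWal(Some(42))), 0);
}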

@@ -1358,7 +1358,7 @@ pub enum RegionTruncateRequest {
///
/// Makes a readonly region catch up to leader region changes.
/// There is no effect if it is operating on a leader region.
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, Default)]
pub struct RegionCatchupRequest {
/// Sets it to writable if it's available after it has caught up with all changes.
pub set_writable: bool,
@@ -1371,6 +1371,8 @@ pub struct RegionCatchupRequest {
pub metadata_entry_id: Option<entry::Id>,
/// The hint for replaying memtable.
pub location_id: Option<u64>,
/// Replay checkpoint.
pub checkpoint: Option<ReplayCheckpoint>,
}
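
The `Default` derive added above lets callers name only the fields they care about. A hedged sketch follows, with `CatchupRequest` as a hypothetical mirror of `RegionCatchupRequest`: the real struct has more fields than this hunk shows, and `entry::Id` / `ReplayCheckpoint` are replaced by `u64` stand-ins.

/// Hypothetical mirror of `RegionCatchupRequest`, trimmed to the fields visible in the hunk.
#[derive(Debug, Clone, Copy, Default)]
struct CatchupRequest {
    set_writable: bool,
    metadata_entry_id: Option<u64>, // stand-in for Option<entry::Id>
    location_id: Option<u64>,
    checkpoint: Option<u64>, // stand-in for Option<ReplayCheckpoint>
}

fn main() {
    // Name only what differs from the defaults; everything else stays false/None.
    let req = CatchupRequest {
        set_writable: true,
        ..Default::default()
    };
    println!("{req:?}");
}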

/// Get sequences of regions by region ids.

@@ -14,13 +14,17 @@ Affected Rows: 0
|
||||
INSERT INTO
|
||||
integers (host, i, ts)
|
||||
VALUES
|
||||
('220-A', 2, '2023-01-01 00:00:00'),
|
||||
('220-B', 3, '2023-01-01 00:00:00'),
|
||||
('550-A', 1, '2023-01-01 00:00:00'),
|
||||
('550-B', 5, '2023-01-01 00:00:00'),
|
||||
('550-A', 2, '2023-01-01 01:00:00'),
|
||||
('550-W', 3, '2023-01-01 02:00:00'),
|
||||
('550-W', 4, '2023-01-01 03:00:00');
|
||||
('550-Z', 4, '2023-01-01 02:00:00'),
|
||||
('550-W', 5, '2023-01-01 03:00:00'),
|
||||
('550-Z', 6, '2023-01-01 03:00:00');
|
||||
|
||||
Affected Rows: 5
|
||||
Affected Rows: 9
|
||||
|
||||
SELECT
|
||||
count(i),
|
||||
@@ -33,7 +37,7 @@ FROM
|
||||
+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
| count(integers.i) | sum(integers.i) | uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)) | hll_count(hll(integers.i)) |
|
||||
+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
| 5 | 15 | 2.9742334234767016 | 5 |
|
||||
| 9 | 31 | 2.9742334234767016 | 6 |
|
||||
+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
|
||||
-- SQLNESS REPLACE (-+) -
|
||||
@@ -122,11 +126,11 @@ SELECT
|
||||
FROM
|
||||
integers;
|
||||
|
||||
+-----------------+
|
||||
| avg(integers.i) |
|
||||
+-----------------+
|
||||
| 3.0 |
|
||||
+-----------------+
|
||||
+--------------------+
|
||||
| avg(integers.i) |
|
||||
+--------------------+
|
||||
| 3.4444444444444446 |
|
||||
+--------------------+
|
||||
|
||||
-- SQLNESS REPLACE (-+) -
|
||||
-- SQLNESS REPLACE (\s\s+) _
|
||||
@@ -214,10 +218,10 @@ ORDER BY
|
||||
+---------------------+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
| ts | count(integers.i) | sum(integers.i) | uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)) | hll_count(hll(integers.i)) |
|
||||
+---------------------+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
| 2023-01-01T00:00:00 | 2 | 6 | 5.002829575110705 | 2 |
|
||||
| 2023-01-01T00:00:00 | 4 | 11 | 2.9742334234767016 | 4 |
|
||||
| 2023-01-01T01:00:00 | 1 | 2 | 1.9936617014173446 | 1 |
|
||||
| 2023-01-01T02:00:00 | 1 | 3 | 2.9742334234767016 | 1 |
|
||||
| 2023-01-01T03:00:00 | 1 | 4 | 4.014835333028587 | 1 |
|
||||
| 2023-01-01T02:00:00 | 2 | 7 | 4.014835333028587 | 2 |
|
||||
| 2023-01-01T03:00:00 | 2 | 11 | 5.98951037117262 | 2 |
|
||||
+---------------------+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
|
||||
-- SQLNESS REPLACE (-+) -
|
||||
@@ -321,6 +325,129 @@ ORDER BY
|
||||
|_|_| Total rows: 4_|
|
||||
+-+-+-+
|
||||
|
||||
SELECT
|
||||
date_bin('2s'::INTERVAL, ts) as time_window,
|
||||
count(i),
|
||||
sum(i),
|
||||
uddsketch_calc(0.5, uddsketch_state(128, 0.01, i)),
|
||||
hll_count(hll(i))
|
||||
FROM
|
||||
integers
|
||||
GROUP BY
|
||||
time_window
|
||||
ORDER BY
|
||||
time_window;
|
||||
|
||||
+---------------------+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
| time_window | count(integers.i) | sum(integers.i) | uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)) | hll_count(hll(integers.i)) |
|
||||
+---------------------+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
| 2023-01-01T00:00:00 | 4 | 11 | 2.9742334234767016 | 4 |
|
||||
| 2023-01-01T01:00:00 | 1 | 2 | 1.9936617014173446 | 1 |
|
||||
| 2023-01-01T02:00:00 | 2 | 7 | 4.014835333028587 | 2 |
|
||||
| 2023-01-01T03:00:00 | 2 | 11 | 5.98951037117262 | 2 |
|
||||
+---------------------+-------------------+-----------------+-----------------------------------------------------------------------------------+----------------------------+
|
||||
|
||||
-- SQLNESS REPLACE (-+) -
|
||||
-- SQLNESS REPLACE (\s\s+) _
|
||||
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
|
||||
-- SQLNESS REPLACE (Hash.*) REDACTED
|
||||
-- SQLNESS REPLACE (peers.*) REDACTED
|
||||
EXPLAIN
|
||||
SELECT
|
||||
date_bin('2s'::INTERVAL, ts) as time_window,
|
||||
count(i),
|
||||
sum(i),
|
||||
uddsketch_calc(0.5, uddsketch_state(128, 0.01, i)),
|
||||
hll_count(hll(i))
|
||||
FROM
|
||||
integers
|
||||
GROUP BY
|
||||
time_window
|
||||
ORDER BY
|
||||
time_window;
|
||||
|
||||
+-+-+
|
||||
| plan_type_| plan_|
|
||||
+-+-+
|
||||
| logical_plan_| Sort: time_window ASC NULLS LAST_|
|
||||
|_|_Projection: date_bin(Utf8("2 seconds"),integers.ts) AS time_window, count(integers.i), sum(integers.i), uddsketch_calc(Float64(0.5), uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_count(hll(integers.i))_|
|
||||
|_|_Aggregate: groupBy=[[date_bin(Utf8("2 seconds"),integers.ts)]], aggr=[[__count_merge(__count_state(integers.i)) AS count(integers.i), __sum_merge(__sum_state(integers.i)) AS sum(integers.i), __uddsketch_state_merge(__uddsketch_state_state(Int64(128),Float64(0.01),integers.i)) AS uddsketch_state(Int64(128),Float64(0.01),integers.i), __hll_merge(__hll_state(integers.i)) AS hll(integers.i)]] |
|
||||
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|
||||
|_| Aggregate: groupBy=[[date_bin(CAST(Utf8("2 seconds") AS Interval(MonthDayNano)), integers.ts)]], aggr=[[__count_state(integers.i), __sum_state(integers.i), __uddsketch_state_state(Int64(128), Float64(0.01), CAST(integers.i AS Float64)), __hll_state(CAST(integers.i AS Utf8))]]_|
|
||||
|_|_TableScan: integers_|
|
||||
|_| ]]_|
|
||||
| physical_plan | SortPreservingMergeExec: [time_window@0 ASC NULLS LAST]_|
|
||||
|_|_SortExec: expr=[time_window@0 ASC NULLS LAST], preserve_partitioning=[true]_|
|
||||
|_|_ProjectionExec: expr=[date_bin(Utf8("2 seconds"),integers.ts)@0 as time_window, count(integers.i)@1 as count(integers.i), sum(integers.i)@2 as sum(integers.i), uddsketch_calc(0.5, uddsketch_state(Int64(128),Float64(0.01),integers.i)@3) as uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_count(hll(integers.i)@4) as hll_count(hll(integers.i))]_|
|
||||
|_|_AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("2 seconds"),integers.ts)@0 as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[count(integers.i), sum(integers.i), uddsketch_state(Int64(128),Float64(0.01),integers.i), hll(integers.i)]_|
|
||||
|_|_CoalesceBatchesExec: target_batch_size=8192_|
|
||||
|_|_RepartitionExec: partitioning=REDACTED
|
||||
|_|_AggregateExec: mode=Partial, gby=[date_bin(Utf8("2 seconds"),integers.ts)@0 as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[count(integers.i), sum(integers.i), uddsketch_state(Int64(128),Float64(0.01),integers.i), hll(integers.i)]_|
|
||||
|_|_CooperativeExec_|
|
||||
|_|_MergeScanExec: REDACTED
|
||||
|_|_|
|
||||
+-+-+
|
||||
|
||||
-- SQLNESS REPLACE (metrics.*) REDACTED
|
||||
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
|
||||
-- SQLNESS REPLACE (-+) -
|
||||
-- SQLNESS REPLACE (\s\s+) _
|
||||
-- SQLNESS REPLACE (peers.*) REDACTED
|
||||
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
|
||||
-- might write to different partitions
|
||||
-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED
|
||||
-- SQLNESS REPLACE (Hash.*) REDACTED
|
||||
EXPLAIN ANALYZE
|
||||
SELECT
|
||||
date_bin('2s'::INTERVAL, ts) as time_window,
|
||||
count(i),
|
||||
sum(i),
|
||||
uddsketch_calc(0.5, uddsketch_state(128, 0.01, i)),
|
||||
hll_count(hll(i))
|
||||
FROM
|
||||
integers
|
||||
GROUP BY
|
||||
time_window
|
||||
ORDER BY
|
||||
time_window;
|
||||
|
||||
+-+-+-+
|
||||
| stage | node | plan_|
|
||||
+-+-+-+
|
||||
| 0_| 0_|_SortPreservingMergeExec: [time_window@0 ASC NULLS LAST] REDACTED
|
||||
|_|_|_SortExec: expr=[time_window@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|
||||
|_|_|_ProjectionExec: expr=[date_bin(Utf8("2 seconds"),integers.ts)@0 as time_window, count(integers.i)@1 as count(integers.i), sum(integers.i)@2 as sum(integers.i), uddsketch_calc(0.5, uddsketch_state(Int64(128),Float64(0.01),integers.i)@3) as uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_count(hll(integers.i)@4) as hll_count(hll(integers.i))] REDACTED
|
||||
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("2 seconds"),integers.ts)@0 as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[count(integers.i), sum(integers.i), uddsketch_state(Int64(128),Float64(0.01),integers.i), hll(integers.i)] REDACTED
|
||||
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|
||||
|_|_|_RepartitionExec: partitioning=REDACTED
|
||||
|_|_|_AggregateExec: mode=Partial, gby=[date_bin(Utf8("2 seconds"),integers.ts)@0 as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[count(integers.i), sum(integers.i), uddsketch_state(Int64(128),Float64(0.01),integers.i), hll(integers.i)] REDACTED
|
||||
|_|_|_CooperativeExec REDACTED
|
||||
|_|_|_MergeScanExec: REDACTED
|
||||
|_|_|_|
|
||||
| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("2 seconds"),integers.ts)@0 as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[__count_state(integers.i), __sum_state(integers.i), __uddsketch_state_state(Int64(128),Float64(0.01),integers.i), __hll_state(integers.i)] REDACTED
|
||||
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|
||||
|_|_|_RepartitionExec: partitioning=REDACTED
|
||||
|_|_|_AggregateExec: mode=Partial, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 2000000000 }, ts@1) as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[__count_state(integers.i), __sum_state(integers.i), __uddsketch_state_state(Int64(128),Float64(0.01),integers.i), __hll_state(integers.i)] REDACTED
|
||||
|_|_|_CooperativeExec REDACTED
|
||||
|_|_|_SeqScan: region=REDACTED, "partition_count":REDACTED REDACTED
|
||||
|_|_|_|
|
||||
| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("2 seconds"),integers.ts)@0 as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[__count_state(integers.i), __sum_state(integers.i), __uddsketch_state_state(Int64(128),Float64(0.01),integers.i), __hll_state(integers.i)] REDACTED
|
||||
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|
||||
|_|_|_RepartitionExec: partitioning=REDACTED
|
||||
|_|_|_AggregateExec: mode=Partial, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 2000000000 }, ts@1) as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[__count_state(integers.i), __sum_state(integers.i), __uddsketch_state_state(Int64(128),Float64(0.01),integers.i), __hll_state(integers.i)] REDACTED
|
||||
|_|_|_CooperativeExec REDACTED
|
||||
|_|_|_SeqScan: region=REDACTED, "partition_count":REDACTED REDACTED
|
||||
|_|_|_|
|
||||
| 1_| 2_|_AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("2 seconds"),integers.ts)@0 as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[__count_state(integers.i), __sum_state(integers.i), __uddsketch_state_state(Int64(128),Float64(0.01),integers.i), __hll_state(integers.i)] REDACTED
|
||||
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|
||||
|_|_|_RepartitionExec: partitioning=REDACTED
|
||||
|_|_|_AggregateExec: mode=Partial, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 2000000000 }, ts@1) as date_bin(Utf8("2 seconds"),integers.ts)], aggr=[__count_state(integers.i), __sum_state(integers.i), __uddsketch_state_state(Int64(128),Float64(0.01),integers.i), __hll_state(integers.i)] REDACTED
|
||||
|_|_|_CooperativeExec REDACTED
|
||||
|_|_|_SeqScan: region=REDACTED, "partition_count":REDACTED REDACTED
|
||||
|_|_|_|
|
||||
|_|_| Total rows: 4_|
|
||||
+-+-+-+
|
||||
|
||||
DROP TABLE integers;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
@@ -12,11 +12,15 @@ CREATE TABLE integers(
|
||||
INSERT INTO
|
||||
integers (host, i, ts)
|
||||
VALUES
|
||||
('220-A', 2, '2023-01-01 00:00:00'),
|
||||
('220-B', 3, '2023-01-01 00:00:00'),
|
||||
('550-A', 1, '2023-01-01 00:00:00'),
|
||||
('550-B', 5, '2023-01-01 00:00:00'),
|
||||
('550-A', 2, '2023-01-01 01:00:00'),
|
||||
('550-W', 3, '2023-01-01 02:00:00'),
|
||||
('550-W', 4, '2023-01-01 03:00:00');
|
||||
('550-Z', 4, '2023-01-01 02:00:00'),
|
||||
('550-W', 5, '2023-01-01 03:00:00'),
|
||||
('550-Z', 6, '2023-01-01 03:00:00');
|
||||
|
||||
SELECT
|
||||
count(i),
|
||||
@@ -142,4 +146,60 @@ GROUP BY
|
||||
ORDER BY
|
||||
ts;
|
||||
|
||||
|
||||
SELECT
|
||||
date_bin('2s'::INTERVAL, ts) as time_window,
|
||||
count(i),
|
||||
sum(i),
|
||||
uddsketch_calc(0.5, uddsketch_state(128, 0.01, i)),
|
||||
hll_count(hll(i))
|
||||
FROM
|
||||
integers
|
||||
GROUP BY
|
||||
time_window
|
||||
ORDER BY
|
||||
time_window;
|
||||
|
||||
-- SQLNESS REPLACE (-+) -
|
||||
-- SQLNESS REPLACE (\s\s+) _
|
||||
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
|
||||
-- SQLNESS REPLACE (Hash.*) REDACTED
|
||||
-- SQLNESS REPLACE (peers.*) REDACTED
|
||||
EXPLAIN
|
||||
SELECT
|
||||
date_bin('2s'::INTERVAL, ts) as time_window,
|
||||
count(i),
|
||||
sum(i),
|
||||
uddsketch_calc(0.5, uddsketch_state(128, 0.01, i)),
|
||||
hll_count(hll(i))
|
||||
FROM
|
||||
integers
|
||||
GROUP BY
|
||||
time_window
|
||||
ORDER BY
|
||||
time_window;
|
||||
|
||||
-- SQLNESS REPLACE (metrics.*) REDACTED
|
||||
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
|
||||
-- SQLNESS REPLACE (-+) -
|
||||
-- SQLNESS REPLACE (\s\s+) _
|
||||
-- SQLNESS REPLACE (peers.*) REDACTED
|
||||
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
|
||||
-- might write to different partitions
|
||||
-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED
|
||||
-- SQLNESS REPLACE (Hash.*) REDACTED
|
||||
EXPLAIN ANALYZE
|
||||
SELECT
|
||||
date_bin('2s'::INTERVAL, ts) as time_window,
|
||||
count(i),
|
||||
sum(i),
|
||||
uddsketch_calc(0.5, uddsketch_state(128, 0.01, i)),
|
||||
hll_count(hll(i))
|
||||
FROM
|
||||
integers
|
||||
GROUP BY
|
||||
time_window
|
||||
ORDER BY
|
||||
time_window;
|
||||
|
||||
DROP TABLE integers;
|
||||
|
||||