Compare commits


1 Commit

Author: discord9
SHA1: 13582c9efb
Message: bytes trace
Signed-off-by: discord9 <discord9@163.com>
Date: 2025-11-04 11:19:07 +08:00
251 changed files with 2670 additions and 10963 deletions


@@ -613,9 +613,6 @@ jobs:
   - name: "MySQL Kvbackend"
     opts: "--setup-mysql"
     kafka: false
-  - name: "Flat format"
-    opts: "--enable-flat-format"
-    kafka: false
 timeout-minutes: 60
 steps:
   - uses: actions/checkout@v4
@@ -811,7 +808,7 @@ jobs:
   - name: Setup external services
     working-directory: tests-integration/fixtures
     run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
   - name: Run nextest cases
     run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend
     env:


@@ -92,6 +92,5 @@ jobs:
 mode:
   - name: "Basic"
   - name: "Remote WAL"
-  - name: "Flat format"
 steps:
   - run: 'echo "No action required"'

Cargo.lock (generated)

@@ -212,9 +212,8 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
[[package]]
name = "api"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
- "arrow-schema",
 "common-base",
 "common-decimal",
 "common-error",
@@ -733,7 +732,7 @@ dependencies = [
[[package]]
name = "auth"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -1337,9 +1336,13 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
+source = "git+https://github.com/discord9/bytes?rev=1572ab22c3cbad0e9b6681d1f68eca4139322a2a#1572ab22c3cbad0e9b6681d1f68eca4139322a2a"
-checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
dependencies = [
+ "backtrace",
+ "crossbeam-channel",
+ "inferno 0.12.2",
+ "papaya",
+ "quanta",
 "serde",
]
@@ -1383,7 +1386,7 @@ dependencies = [
[[package]]
name = "cache"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "catalog",
 "common-error",
@@ -1418,7 +1421,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow",
@@ -1630,7 +1633,6 @@ dependencies = [
 "chrono",
 "chrono-tz-build",
 "phf 0.11.3",
- "uncased",
]

[[package]]
@@ -1641,8 +1643,6 @@ checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402"
dependencies = [
 "parse-zoneinfo",
 "phf_codegen 0.11.3",
- "phf_shared 0.11.3",
- "uncased",
]

[[package]]
@@ -1763,7 +1763,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
[[package]]
name = "cli"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-stream",
 "async-trait",
@@ -1816,7 +1816,7 @@ dependencies = [
[[package]]
name = "client"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arc-swap",
@@ -1848,8 +1848,8 @@ dependencies = [
 "serde_json",
 "snafu 0.8.6",
 "store-api",
+ "substrait 0.18.0",
 "substrait 0.37.3",
- "substrait 1.0.0-beta.1",
 "tokio",
 "tokio-stream",
 "tonic 0.13.1",
@@ -1889,7 +1889,7 @@ dependencies = [
[[package]]
name = "cmd"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "auth",
@@ -2012,7 +2012,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anymap2",
 "async-trait",
@@ -2036,14 +2036,14 @@ dependencies = [
[[package]]
name = "common-catalog"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "const_format",
]

[[package]]
name = "common-config"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-error",
@@ -2067,7 +2067,7 @@ dependencies = [
[[package]]
name = "common-datasource"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arrow",
 "arrow-schema",
@@ -2102,7 +2102,7 @@ dependencies = [
[[package]]
name = "common-decimal"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "bigdecimal 0.4.8",
 "common-error",
@@ -2115,7 +2115,7 @@ dependencies = [
[[package]]
name = "common-error"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-macro",
 "http 1.3.1",
@@ -2126,7 +2126,7 @@ dependencies = [
[[package]]
name = "common-event-recorder"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -2148,7 +2148,7 @@ dependencies = [
[[package]]
name = "common-frontend"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -2170,7 +2170,7 @@ dependencies = [
[[package]]
name = "common-function"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -2229,7 +2229,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "common-runtime",
@@ -2246,7 +2246,7 @@ dependencies = [
[[package]]
name = "common-grpc"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-flight",
@@ -2279,7 +2279,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "common-base",
@@ -2299,7 +2299,7 @@ dependencies = [
[[package]]
name = "common-macro"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "greptime-proto",
 "once_cell",
@@ -2310,7 +2310,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anyhow",
 "common-error",
@@ -2326,7 +2326,7 @@ dependencies = [
[[package]]
name = "common-meta"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anymap2",
 "api",
@@ -2398,7 +2398,7 @@ dependencies = [
[[package]]
name = "common-options"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-grpc",
 "humantime-serde",
@@ -2407,11 +2407,11 @@ dependencies = [
[[package]]
name = "common-plugins"
-version = "1.0.0-beta.1"
+version = "0.18.0"

[[package]]
name = "common-pprof"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-error",
 "common-macro",
@@ -2423,7 +2423,7 @@ dependencies = [
[[package]]
name = "common-procedure"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-stream",
@@ -2452,7 +2452,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "common-procedure",
@@ -2462,7 +2462,7 @@ dependencies = [
[[package]]
name = "common-query"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -2488,7 +2488,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arc-swap",
 "common-base",
@@ -2512,7 +2512,7 @@ dependencies = [
[[package]]
name = "common-runtime"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "clap 4.5.40",
@@ -2541,7 +2541,7 @@ dependencies = [
[[package]]
name = "common-session"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "serde",
 "strum 0.27.1",
@@ -2549,7 +2549,7 @@ dependencies = [
[[package]]
name = "common-sql"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-decimal",
@@ -2567,7 +2567,7 @@ dependencies = [
[[package]]
name = "common-stat"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-runtime",
@@ -2582,7 +2582,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "backtrace",
 "common-base",
@@ -2611,7 +2611,7 @@ dependencies = [
[[package]]
name = "common-test-util"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "client",
 "common-grpc",
@@ -2624,7 +2624,7 @@ dependencies = [
[[package]]
name = "common-time"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arrow",
 "chrono",
@@ -2642,7 +2642,7 @@ dependencies = [
[[package]]
name = "common-version"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "build-data",
 "cargo-manifest",
@@ -2653,7 +2653,7 @@ dependencies = [
[[package]]
name = "common-wal"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-error",
@@ -2676,7 +2676,7 @@ dependencies = [
[[package]]
name = "common-workload"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-telemetry",
 "serde",
@@ -3913,7 +3913,7 @@ dependencies = [
[[package]]
name = "datanode"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-flight",
@@ -3977,7 +3977,7 @@ dependencies = [
[[package]]
name = "datatypes"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arrow",
 "arrow-array",
@@ -4649,7 +4649,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "file-engine"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -4781,7 +4781,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
[[package]]
name = "flow"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow",
@@ -4850,7 +4850,7 @@ dependencies = [
 "sql",
 "store-api",
 "strum 0.27.1",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tokio",
 "tonic 0.13.1",
@@ -4905,7 +4905,7 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619"
[[package]]
name = "frontend"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arc-swap",
@@ -6116,7 +6116,7 @@ dependencies = [
[[package]]
name = "index"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "asynchronous-codec",
@@ -7045,7 +7045,7 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "log-query"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "chrono",
 "common-error",
@@ -7057,7 +7057,7 @@ dependencies = [
[[package]]
name = "log-store"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-stream",
 "async-trait",
@@ -7364,7 +7364,7 @@ dependencies = [
[[package]]
name = "meta-client"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -7392,7 +7392,7 @@ dependencies = [
[[package]]
name = "meta-srv"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -7490,7 +7490,7 @@ dependencies = [
[[package]]
name = "metric-engine"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "aquamarine",
@@ -7508,7 +7508,6 @@ dependencies = [
 "common-telemetry",
 "common-test-util",
 "common-time",
- "common-wal",
 "datafusion",
 "datatypes",
 "futures-util",
@@ -7585,7 +7584,7 @@ dependencies = [
[[package]]
name = "mito-codec"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "bytes",
@@ -7610,7 +7609,7 @@ dependencies = [
[[package]]
name = "mito2"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "aquamarine",
@@ -8348,7 +8347,7 @@ dependencies = [
[[package]]
name = "object-store"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anyhow",
 "bytes",
@@ -8633,7 +8632,7 @@ dependencies = [
[[package]]
name = "operator"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -8691,7 +8690,7 @@ dependencies = [
 "sql",
 "sqlparser",
 "store-api",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tokio",
 "tokio-util",
@@ -8867,6 +8866,16 @@ dependencies = [
 "unicode-width 0.1.14",
]

+[[package]]
+name = "papaya"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f"
+dependencies = [
+ "equivalent",
+ "seize",
+]
+
[[package]]
name = "parking"
version = "2.2.1"
@@ -8977,7 +8986,7 @@ dependencies = [
[[package]]
name = "partition"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -9276,9 +9285,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
 "siphasher",
- "uncased",
]

[[package]]
@@ -9322,7 +9330,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pipeline"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -9478,7 +9486,7 @@ dependencies = [
[[package]]
name = "plugins"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "auth",
 "clap 4.5.40",
@@ -9778,7 +9786,7 @@ dependencies = [
[[package]]
name = "promql"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "async-trait",
@@ -10061,7 +10069,7 @@ dependencies = [
[[package]]
name = "puffin"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-compression 0.4.19",
 "async-trait",
@@ -10101,9 +10109,24 @@ dependencies = [
 "variadics",
]

+[[package]]
+name = "quanta"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
+dependencies = [
+ "crossbeam-utils",
+ "libc",
+ "once_cell",
+ "raw-cpuid",
+ "wasi 0.11.1+wasi-snapshot-preview1",
+ "web-sys",
+ "winapi",
+]
+
[[package]]
name = "query"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -10127,7 +10150,6 @@ dependencies = [
 "common-query",
 "common-recordbatch",
 "common-runtime",
- "common-stat",
 "common-telemetry",
 "common-time",
 "datafusion",
@@ -10170,7 +10192,7 @@ dependencies = [
 "sql",
 "sqlparser",
 "store-api",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tokio",
 "tokio-stream",
@@ -10401,6 +10423,15 @@ dependencies = [
 "thiserror 1.0.69",
]

+[[package]]
+name = "raw-cpuid"
+version = "11.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
+dependencies = [
+ "bitflags 2.9.1",
+]
+
[[package]]
name = "rawpointer"
version = "0.2.1"
@@ -11341,6 +11372,16 @@ dependencies = [
 "libc",
]

+[[package]]
+name = "seize"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b55fb86dfd3a2f5f76ea78310a88f96c4ea21a3031f8d212443d56123fd0521"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "semver"
version = "1.0.26"
@@ -11506,7 +11547,7 @@ dependencies = [
[[package]]
name = "servers"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -11632,7 +11673,7 @@ dependencies = [
[[package]]
name = "session"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -11966,7 +12007,7 @@ dependencies = [
[[package]]
name = "sql"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-buffer",
@@ -12026,7 +12067,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "clap 4.5.40",
@@ -12303,7 +12344,7 @@ dependencies = [
[[package]]
name = "standalone"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "catalog",
@@ -12344,7 +12385,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "store-api"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "aquamarine",
@@ -12509,6 +12550,28 @@ dependencies = [
 "winapi",
]

+[[package]]
+name = "substrait"
+version = "0.18.0"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "common-error",
+ "common-function",
+ "common-macro",
+ "common-telemetry",
+ "datafusion",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-substrait",
+ "datatypes",
+ "promql",
+ "prost 0.13.5",
+ "snafu 0.8.6",
+ "substrait 0.37.3",
+ "tokio",
+]
+
[[package]]
name = "substrait"
version = "0.37.3"
@@ -12555,28 +12618,6 @@ dependencies = [
 "walkdir",
]

-[[package]]
-name = "substrait"
-version = "1.0.0-beta.1"
-dependencies = [
- "async-trait",
- "bytes",
- "common-error",
- "common-function",
- "common-macro",
- "common-telemetry",
- "datafusion",
- "datafusion-common",
- "datafusion-expr",
- "datafusion-substrait",
- "datatypes",
- "promql",
- "prost 0.13.5",
- "snafu 0.8.6",
- "substrait 0.37.3",
- "tokio",
-]
-
[[package]]
name = "subtle"
version = "2.6.1"
@@ -12680,7 +12721,7 @@ dependencies = [
[[package]]
name = "table"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -12949,7 +12990,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
[[package]]
name = "tests-fuzz"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arbitrary",
 "async-trait",
@@ -12993,7 +13034,7 @@ dependencies = [
[[package]]
name = "tests-integration"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-flight",
@@ -13067,7 +13108,7 @@ dependencies = [
 "sqlx",
 "standalone",
 "store-api",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tempfile",
 "time",
@@ -13977,15 +14018,6 @@ dependencies = [
 "serde",
]

-[[package]]
-name = "uncased"
-version = "0.9.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697"
-dependencies = [
- "version_check",
-]
-
[[package]]
name = "unescaper"
version = "0.1.6"


@@ -74,7 +74,7 @@ members = [
resolver = "2"

[workspace.package]
-version = "1.0.0-beta.1"
+version = "0.18.0"
edition = "2024"
license = "Apache-2.0"
@@ -118,7 +118,7 @@ bitflags = "2.4.1"
bytemuck = "1.12"
bytes = { version = "1.7", features = ["serde"] }
chrono = { version = "0.4", features = ["serde"] }
-chrono-tz = { version = "0.10.1", features = ["case-insensitive"] }
+chrono-tz = "0.10.1"
clap = { version = "4.4", features = ["derive"] }
config = "0.13.0"
const_format = "0.2"
@@ -219,7 +219,12 @@ similar-asserts = "1.6.0"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.8"
sqlparser = { version = "0.58.0", default-features = false, features = ["std", "visitor", "serde"] }
-sqlx = { version = "0.8", default-features = false, features = ["any", "macros", "json", "runtime-tokio-rustls"] }
+sqlx = { version = "0.8", features = [
+    "runtime-tokio-rustls",
+    "mysql",
+    "postgres",
+    "chrono",
+] }
strum = { version = "0.27", features = ["derive"] }
sysinfo = "0.33"
tempfile = "3"
@@ -328,6 +333,7 @@ datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git"
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"
+bytes = { git = "https://github.com/discord9/bytes", rev = "1572ab22c3cbad0e9b6681d1f68eca4139322a2a" }

[profile.release]
debug = 1


@@ -12,6 +12,7 @@
<div align="center">
<h3 align="center">
+<a href="https://greptime.com/product/cloud">GreptimeCloud</a> |
<a href="https://docs.greptime.com/">User Guide</a> |
<a href="https://greptimedb.rs/">API Docs</a> |
<a href="https://github.com/GreptimeTeam/greptimedb/issues/5446">Roadmap 2025</a>
@@ -104,6 +105,16 @@ Read [more benchmark reports](https://docs.greptime.com/user-guide/concepts/feat
## Try GreptimeDB

+### 1. [Live Demo](https://greptime.com/playground)
+Experience GreptimeDB directly in your browser.
+### 2. [GreptimeCloud](https://console.greptime.cloud/)
+Start instantly with a free cluster.
+### 3. Docker (Local Quickstart)

```shell
docker pull greptime/greptimedb
```


@@ -16,7 +16,7 @@
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
-| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
+| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. |
| `runtime` | -- | -- | The runtime options. |
@@ -104,7 +104,6 @@
| `flow.num_workers` | Integer | `0` | The number of flow worker in flownode.<br/>Not setting(or set to 0) this value will use the number of CPU cores divided by 2. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
@@ -152,13 +151,10 @@
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
-| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
-| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
-| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
@@ -192,7 +188,7 @@
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `region_engine.file` | -- | -- | Enable the file engine. |
| `region_engine.metric` | -- | -- | Metric engine options. |
-| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. |
+| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -312,7 +308,6 @@
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `query.allow_query_fallback` | Bool | `false` | Whether to allow query fallback when push down optimize fails.<br/>Default to false, meaning when push down optimize failed, return error msg |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans (only applies to datanodes). |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.connect_timeout` | String | `10s` | -- |
@@ -451,7 +446,7 @@
| `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.<br/>It will block the datanode start if it can't receive leases in the heartbeat from metasrv. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
-| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
+| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
@@ -505,7 +500,6 @@
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries during read WAL.<br/>**It's only used when the provider is `kafka`**.<br/><br/>This option ensures that when Kafka messages are deleted, the system<br/>can still successfully replay memtable data without throwing an<br/>out-of-range error.<br/>However, enabling this option might lead to unexpected data loss,<br/>as the system will skip over missing entries instead of treating<br/>them as critical errors. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
@@ -555,13 +549,10 @@
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
-| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
-| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
-| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
@@ -595,7 +586,7 @@
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `region_engine.file` | -- | -- | Enable the file engine. |
| `region_engine.metric` | -- | -- | Metric engine options. |
-| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. |
+| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -682,6 +673,5 @@
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `query` | -- | -- | -- |
| `query.parallelism` | Integer | `1` | Parallelism of the query engine for query sent by flownode.<br/>Default to 1, so it won't use too much cpu or memory |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |


@@ -18,9 +18,6 @@ init_regions_in_background = false
init_regions_parallelism = 16

## The maximum current queries allowed to be executed. Zero means unlimited.
-## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
-## When set, 70% of queries get privileged memory access (full scan_memory_limit).
-## The remaining 30% get standard tier access (70% of scan_memory_limit).
max_concurrent_queries = 0

## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -264,13 +261,6 @@ overwrite_entry_start_id = false
## Default to 0, which means the number of CPU cores.
parallelism = 0

-## Memory pool size for query execution operators (aggregation, sorting, join).
-## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
-## Setting it to 0 disables the limit (unbounded, default behavior).
-## When this limit is reached, queries will fail with ResourceExhausted error.
-## NOTE: This does NOT limit memory used by table scans.
-memory_pool_size = "50%"

## The data storage options.
[storage]
## The working home directory.
@@ -499,17 +489,6 @@ write_cache_size = "5GiB"
## @toml2docs:none-default
write_cache_ttl = "8h"

-## Preload index (puffin) files into cache on region open (default: true).
-## When enabled, index files are loaded into the write cache during region initialization,
-## which can improve query performance at the cost of longer startup times.
-preload_index_cache = true

-## Percentage of write cache capacity allocated for index (puffin) files (default: 20).
-## The remaining capacity is used for data (parquet) files.
-## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
-## 1GiB is reserved for index files and 4GiB for data files.
-index_cache_percent = 20

## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
@@ -522,14 +501,6 @@ max_concurrent_scan_files = 384
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false

-## Memory limit for table scans across all queries.
-## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
-## Setting it to 0 disables the limit.
-## NOTE: Works with max_concurrent_queries for tiered memory allocation.
-## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
-## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
-scan_memory_limit = "50%"

## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m"
@@ -669,8 +640,8 @@ fork_dictionary_bytes = "1GiB"
[[region_engine]]
## Metric engine options.
[region_engine.metric]
-## Whether to use sparse primary key encoding.
-sparse_primary_key_encoding = true
+## Whether to enable the experimental sparse primary key encoding.
+experimental_sparse_primary_key_encoding = false

## The logging options.
[logging]


@@ -158,13 +158,6 @@ default_ratio = 1.0
## Default to 1, so it won't use too much cpu or memory
parallelism = 1

-## Memory pool size for query execution operators (aggregation, sorting, join).
-## Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%").
-## Setting it to 0 disables the limit (unbounded, default behavior).
-## When this limit is reached, queries will fail with ResourceExhausted error.
-## NOTE: This does NOT limit memory used by table scans.
-memory_pool_size = "50%"

## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.


@@ -256,13 +256,6 @@ parallelism = 0
## Default to false, meaning when push down optimize failed, return error msg
allow_query_fallback = false

-## Memory pool size for query execution operators (aggregation, sorting, join).
-## Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%").
-## Setting it to 0 disables the limit (unbounded, default behavior).
-## When this limit is reached, queries will fail with ResourceExhausted error.
-## NOTE: This does NOT limit memory used by table scans (only applies to datanodes).
-memory_pool_size = "50%"

## Datanode options.
[datanode]
## Datanode client options.


@@ -14,9 +14,6 @@ init_regions_in_background = false
init_regions_parallelism = 16 init_regions_parallelism = 16
## The maximum current queries allowed to be executed. Zero means unlimited. ## The maximum current queries allowed to be executed. Zero means unlimited.
## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
## When set, 70% of queries get privileged memory access (full scan_memory_limit).
## The remaining 30% get standard tier access (70% of scan_memory_limit).
max_concurrent_queries = 0 max_concurrent_queries = 0
## Enable telemetry to collect anonymous usage data. Enabled by default. ## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -368,13 +365,6 @@ max_running_procedures = 128
## Default to 0, which means the number of CPU cores. ## Default to 0, which means the number of CPU cores.
parallelism = 0 parallelism = 0
## Memory pool size for query execution operators (aggregation, sorting, join).
## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit (unbounded, default behavior).
## When this limit is reached, queries will fail with ResourceExhausted error.
## NOTE: This does NOT limit memory used by table scans.
memory_pool_size = "50%"
## The data storage options. ## The data storage options.
[storage] [storage]
## The working home directory. ## The working home directory.
@@ -590,17 +580,6 @@ write_cache_size = "5GiB"
## @toml2docs:none-default ## @toml2docs:none-default
write_cache_ttl = "8h" write_cache_ttl = "8h"
## Preload index (puffin) files into cache on region open (default: true).
## When enabled, index files are loaded into the write cache during region initialization,
## which can improve query performance at the cost of longer startup times.
preload_index_cache = true
## Percentage of write cache capacity allocated for index (puffin) files (default: 20).
## The remaining capacity is used for data (parquet) files.
## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
## 1GiB is reserved for index files and 4GiB for data files.
index_cache_percent = 20
## Buffer size for SST writing. ## Buffer size for SST writing.
sst_write_buffer_size = "8MB" sst_write_buffer_size = "8MB"
@@ -613,14 +592,6 @@ max_concurrent_scan_files = 384
## Whether to allow stale WAL entries read during replay. ## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false allow_stale_entries = false
## Memory limit for table scans across all queries.
## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit.
## NOTE: Works with max_concurrent_queries for tiered memory allocation.
## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
scan_memory_limit = "50%"
## Minimum time interval between two compactions. ## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions). ## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m" min_compaction_interval = "0m"
@@ -760,8 +731,8 @@ fork_dictionary_bytes = "1GiB"
[[region_engine]] [[region_engine]]
## Metric engine options. ## Metric engine options.
[region_engine.metric] [region_engine.metric]
## Whether to use sparse primary key encoding. ## Whether to enable the experimental sparse primary key encoding.
sparse_primary_key_encoding = true experimental_sparse_primary_key_encoding = false
## The logging options. ## The logging options.
[logging] [logging]
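The removed comments for `max_concurrent_queries` and `scan_memory_limit` above describe a tiered allocation: with a concurrency cap set, 70% of queries get privileged access to the full scan limit and the rest are capped at 70% of it; with unlimited concurrency, the first 20 queries are privileged. The sketch below only restates that documented arithmetic; the function name and constants are illustrative, not the engine's actual implementation.

```rust
/// Illustrative only: mirrors the documented tiering of `scan_memory_limit`,
/// not the real region engine implementation.
fn scan_memory_tiers(scan_memory_limit: u64, max_concurrent_queries: u64) -> (u64, u64, u64) {
    // Number of queries that get privileged (full-limit) access.
    let privileged_slots = if max_concurrent_queries == 0 {
        // Unlimited concurrency: the first 20 queries are privileged.
        20
    } else {
        // Otherwise 70% of the allowed queries are privileged.
        max_concurrent_queries * 70 / 100
    };
    // Privileged queries may use the full limit; the rest are capped at 70% of it.
    let privileged_limit = scan_memory_limit;
    let standard_limit = scan_memory_limit * 70 / 100;
    (privileged_slots, privileged_limit, standard_limit)
}

fn main() {
    // Example: 8 GiB scan limit, 100 concurrent queries allowed.
    let (slots, full, capped) = scan_memory_tiers(8 << 30, 100);
    println!("{slots} privileged queries at {full} bytes each, others capped at {capped} bytes");
}
```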

View File

@@ -92,6 +92,9 @@ curl -X POST localhost:4000/debug/prof/mem > greptime.hprof
curl -X POST "localhost:4000/debug/prof/mem?output=flamegraph" > greptime.svg curl -X POST "localhost:4000/debug/prof/mem?output=flamegraph" > greptime.svg
# or output pprof format # or output pprof format
curl -X POST "localhost:4000/debug/prof/mem?output=proto" > greptime.pprof curl -X POST "localhost:4000/debug/prof/mem?output=proto" > greptime.pprof
curl -X POST "localhost:4000/debug/prof/bytes" > greptime.svg
``` ```
You can periodically dump profiling data and compare them to find the delta memory usage. You can periodically dump profiling data and compare them to find the delta memory usage.

View File

@@ -8,7 +8,6 @@ license.workspace = true
workspace = true workspace = true
[dependencies] [dependencies]
arrow-schema.workspace = true
common-base.workspace = true common-base.workspace = true
common-decimal.workspace = true common-decimal.workspace = true
common-error.workspace = true common-error.workspace = true

View File

@@ -14,11 +14,10 @@
use std::collections::HashMap; use std::collections::HashMap;
use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};
use datatypes::schema::{ use datatypes::schema::{
COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer, COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY,
SkippingIndexType, SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType,
}; };
use greptime_proto::v1::{ use greptime_proto::v1::{
Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType, Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType,
@@ -69,14 +68,8 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) { if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) {
metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned()); metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned());
} }
if let Some(extension_name) = options.options.get(EXTENSION_TYPE_NAME_KEY) { if let Some(settings) = options.options.get(JSON_STRUCTURE_SETTINGS_KEY) {
metadata.insert(EXTENSION_TYPE_NAME_KEY.to_string(), extension_name.clone()); metadata.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
}
if let Some(extension_metadata) = options.options.get(EXTENSION_TYPE_METADATA_KEY) {
metadata.insert(
EXTENSION_TYPE_METADATA_KEY.to_string(),
extension_metadata.clone(),
);
} }
} }
@@ -149,16 +142,10 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<Column
.options .options
.insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone()); .insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone());
} }
if let Some(extension_name) = column_schema.metadata().get(EXTENSION_TYPE_NAME_KEY) { if let Some(settings) = column_schema.metadata().get(JSON_STRUCTURE_SETTINGS_KEY) {
options options
.options .options
.insert(EXTENSION_TYPE_NAME_KEY.to_string(), extension_name.clone()); .insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
}
if let Some(extension_metadata) = column_schema.metadata().get(EXTENSION_TYPE_METADATA_KEY) {
options.options.insert(
EXTENSION_TYPE_METADATA_KEY.to_string(),
extension_metadata.clone(),
);
} }
(!options.options.is_empty()).then_some(options) (!options.options.is_empty()).then_some(options)

View File

@@ -97,6 +97,7 @@ lazy_static! {
ROUTINES, ROUTINES,
SCHEMA_PRIVILEGES, SCHEMA_PRIVILEGES,
TABLE_PRIVILEGES, TABLE_PRIVILEGES,
TRIGGERS,
GLOBAL_STATUS, GLOBAL_STATUS,
SESSION_STATUS, SESSION_STATUS,
PARTITIONS, PARTITIONS,
@@ -206,6 +207,7 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
ROUTINES => setup_memory_table!(ROUTINES), ROUTINES => setup_memory_table!(ROUTINES),
SCHEMA_PRIVILEGES => setup_memory_table!(SCHEMA_PRIVILEGES), SCHEMA_PRIVILEGES => setup_memory_table!(SCHEMA_PRIVILEGES),
TABLE_PRIVILEGES => setup_memory_table!(TABLE_PRIVILEGES), TABLE_PRIVILEGES => setup_memory_table!(TABLE_PRIVILEGES),
TRIGGERS => setup_memory_table!(TRIGGERS),
GLOBAL_STATUS => setup_memory_table!(GLOBAL_STATUS), GLOBAL_STATUS => setup_memory_table!(GLOBAL_STATUS),
SESSION_STATUS => setup_memory_table!(SESSION_STATUS), SESSION_STATUS => setup_memory_table!(SESSION_STATUS),
KEY_COLUMN_USAGE => Some(Arc::new(InformationSchemaKeyColumnUsage::new( KEY_COLUMN_USAGE => Some(Arc::new(InformationSchemaKeyColumnUsage::new(

View File

@@ -15,7 +15,8 @@
use std::sync::Arc; use std::sync::Arc;
use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE}; use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE};
use datatypes::schema::{Schema, SchemaRef}; use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::vectors::{Int64Vector, StringVector, VectorRef}; use datatypes::vectors::{Int64Vector, StringVector, VectorRef};
use crate::system_schema::information_schema::table_names::*; use crate::system_schema::information_schema::table_names::*;
@@ -365,6 +366,16 @@ pub(super) fn get_schema_columns(table_name: &str) -> (SchemaRef, Vec<VectorRef>
vec![], vec![],
), ),
TRIGGERS => (
vec![
string_column("TRIGGER_NAME"),
ColumnSchema::new("trigger_id", ConcreteDataType::uint64_datatype(), false),
string_column("TRIGGER_DEFINITION"),
ColumnSchema::new("flownode_id", ConcreteDataType::uint64_datatype(), true),
],
vec![],
),
// TODO: Considering store internal metrics in `global_status` and // TODO: Considering store internal metrics in `global_status` and
// `session_status` tables. // `session_status` tables.
GLOBAL_STATUS => ( GLOBAL_STATUS => (

View File

@@ -23,8 +23,6 @@ use crate::Tool;
use crate::data::export::ExportCommand; use crate::data::export::ExportCommand;
use crate::data::import::ImportCommand; use crate::data::import::ImportCommand;
pub(crate) const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";
/// Command for data operations including exporting data from and importing data into GreptimeDB. /// Command for data operations including exporting data from and importing data into GreptimeDB.
#[derive(Subcommand)] #[derive(Subcommand)]
pub enum DataCommand { pub enum DataCommand {

View File

@@ -30,7 +30,7 @@ use snafu::{OptionExt, ResultExt};
use tokio::sync::Semaphore; use tokio::sync::Semaphore;
use tokio::time::Instant; use tokio::time::Instant;
use crate::data::{COPY_PATH_PLACEHOLDER, default_database}; use crate::data::default_database;
use crate::database::{DatabaseClient, parse_proxy_opts}; use crate::database::{DatabaseClient, parse_proxy_opts};
use crate::error::{ use crate::error::{
EmptyResultSnafu, Error, OpenDalSnafu, OutputDirNotSetSnafu, Result, S3ConfigNotSetSnafu, EmptyResultSnafu, Error, OpenDalSnafu, OutputDirNotSetSnafu, Result, S3ConfigNotSetSnafu,
@@ -668,26 +668,10 @@ impl Export {
); );
// Create copy_from.sql file // Create copy_from.sql file
let copy_database_from_sql = { let copy_database_from_sql = format!(
let command_without_connection = format!( r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#,
r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({});"#, export_self.catalog, schema, path, with_options_clone, connection_part
export_self.catalog, schema, COPY_PATH_PLACEHOLDER, with_options_clone );
);
if connection_part.is_empty() {
command_without_connection
} else {
let command_with_connection = format!(
r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#,
export_self.catalog, schema, path, with_options_clone, connection_part
);
format!(
"-- {}\n{}",
command_with_connection, command_without_connection
)
}
};
let copy_from_path = export_self.get_file_path(&schema, "copy_from.sql"); let copy_from_path = export_self.get_file_path(&schema, "copy_from.sql");
export_self export_self

View File

@@ -21,13 +21,13 @@ use clap::{Parser, ValueEnum};
use common_catalog::consts::DEFAULT_SCHEMA_NAME; use common_catalog::consts::DEFAULT_SCHEMA_NAME;
use common_error::ext::BoxedError; use common_error::ext::BoxedError;
use common_telemetry::{error, info, warn}; use common_telemetry::{error, info, warn};
use snafu::{OptionExt, ResultExt, ensure}; use snafu::{OptionExt, ResultExt};
use tokio::sync::Semaphore; use tokio::sync::Semaphore;
use tokio::time::Instant; use tokio::time::Instant;
use crate::data::{COPY_PATH_PLACEHOLDER, default_database}; use crate::data::default_database;
use crate::database::{DatabaseClient, parse_proxy_opts}; use crate::database::{DatabaseClient, parse_proxy_opts};
use crate::error::{Error, FileIoSnafu, InvalidArgumentsSnafu, Result, SchemaNotFoundSnafu}; use crate::error::{Error, FileIoSnafu, Result, SchemaNotFoundSnafu};
use crate::{Tool, database}; use crate::{Tool, database};
#[derive(Debug, Default, Clone, ValueEnum)] #[derive(Debug, Default, Clone, ValueEnum)]
@@ -148,15 +148,12 @@ impl Import {
let _permit = semaphore_moved.acquire().await.unwrap(); let _permit = semaphore_moved.acquire().await.unwrap();
let database_input_dir = self.catalog_path().join(&schema); let database_input_dir = self.catalog_path().join(&schema);
let sql_file = database_input_dir.join(filename); let sql_file = database_input_dir.join(filename);
let mut sql = tokio::fs::read_to_string(sql_file) let sql = tokio::fs::read_to_string(sql_file)
.await .await
.context(FileIoSnafu)?; .context(FileIoSnafu)?;
if sql.trim().is_empty() { if sql.is_empty() {
info!("Empty `{filename}` {database_input_dir:?}"); info!("Empty `{filename}` {database_input_dir:?}");
} else { } else {
if filename == "copy_from.sql" {
sql = self.rewrite_copy_database_sql(&schema, &sql)?;
}
let db = exec_db.unwrap_or(&schema); let db = exec_db.unwrap_or(&schema);
self.database_client.sql(&sql, db).await?; self.database_client.sql(&sql, db).await?;
info!("Imported `{filename}` for database {schema}"); info!("Imported `{filename}` for database {schema}");
@@ -229,57 +226,6 @@ impl Import {
} }
Ok(db_names) Ok(db_names)
} }
fn rewrite_copy_database_sql(&self, schema: &str, sql: &str) -> Result<String> {
let target_location = self.build_copy_database_location(schema);
let escaped_location = target_location.replace('\'', "''");
let mut first_stmt_checked = false;
for line in sql.lines() {
let trimmed = line.trim_start();
if trimmed.is_empty() || trimmed.starts_with("--") {
continue;
}
ensure!(
trimmed.starts_with("COPY DATABASE"),
InvalidArgumentsSnafu {
msg: "Expected COPY DATABASE statement at start of copy_from.sql"
}
);
first_stmt_checked = true;
break;
}
ensure!(
first_stmt_checked,
InvalidArgumentsSnafu {
msg: "COPY DATABASE statement not found in copy_from.sql"
}
);
ensure!(
sql.contains(COPY_PATH_PLACEHOLDER),
InvalidArgumentsSnafu {
msg: format!(
"Placeholder `{}` not found in COPY DATABASE statement",
COPY_PATH_PLACEHOLDER
)
}
);
Ok(sql.replacen(COPY_PATH_PLACEHOLDER, &escaped_location, 1))
}
fn build_copy_database_location(&self, schema: &str) -> String {
let mut path = self.catalog_path();
path.push(schema);
let mut path_str = path.to_string_lossy().into_owned();
if !path_str.ends_with('/') {
path_str.push('/');
}
path_str
}
} }
#[async_trait] #[async_trait]
@@ -295,52 +241,3 @@ impl Tool for Import {
} }
} }
} }
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::*;
fn build_import(input_dir: &str) -> Import {
Import {
catalog: "catalog".to_string(),
schema: None,
database_client: DatabaseClient::new(
"127.0.0.1:4000".to_string(),
"catalog".to_string(),
None,
Duration::from_secs(0),
None,
),
input_dir: input_dir.to_string(),
parallelism: 1,
target: ImportTarget::Data,
}
}
#[test]
fn rewrite_copy_database_sql_replaces_placeholder() {
let import = build_import("/tmp/export-path");
let comment = "-- COPY DATABASE \"catalog\".\"schema\" FROM 's3://bucket/demo/' WITH (format = 'parquet') CONNECTION (region = 'us-west-2')";
let sql = format!(
"{comment}\nCOPY DATABASE \"catalog\".\"schema\" FROM '{}' WITH (format = 'parquet');",
COPY_PATH_PLACEHOLDER
);
let rewritten = import.rewrite_copy_database_sql("schema", &sql).unwrap();
let expected_location = import.build_copy_database_location("schema");
let escaped = expected_location.replace('\'', "''");
assert!(rewritten.starts_with(comment));
assert!(rewritten.contains(&format!("FROM '{escaped}'")));
assert!(!rewritten.contains(COPY_PATH_PLACEHOLDER));
}
#[test]
fn rewrite_copy_database_sql_requires_placeholder() {
let import = build_import("/tmp/export-path");
let sql = "COPY DATABASE \"catalog\".\"schema\" FROM '/tmp/export-path/catalog/schema/' WITH (format = 'parquet');";
assert!(import.rewrite_copy_database_sql("schema", sql).is_err());
}
}
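Taken together, the export/import flow above writes `copy_from.sql` with the `<PATH/TO/FILES>` placeholder (keeping the fully specified remote command as a leading comment) and substitutes the local data directory at import time, doubling single quotes so the path stays a valid SQL string literal. A minimal sketch of that substitution, reusing the placeholder constant from the diff; it is an illustration, not the `Import` tool itself.

```rust
const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";

/// Replace the placeholder with a concrete local directory, escaping any
/// single quotes so the value remains a valid SQL string literal.
fn substitute_copy_path(sql: &str, local_dir: &str) -> String {
    let escaped = local_dir.replace('\'', "''");
    sql.replacen(COPY_PATH_PLACEHOLDER, &escaped, 1)
}

fn main() {
    let exported = format!(
        r#"COPY DATABASE "catalog"."schema" FROM '{}' WITH (format = 'parquet');"#,
        COPY_PATH_PLACEHOLDER
    );
    let rewritten = substitute_copy_path(&exported, "/tmp/export-path/catalog/schema/");
    assert!(rewritten.contains("FROM '/tmp/export-path/catalog/schema/'"));
    println!("{rewritten}");
}
```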

View File

@@ -20,9 +20,7 @@ use api::v1::health_check_client::HealthCheckClient;
use api::v1::prometheus_gateway_client::PrometheusGatewayClient; use api::v1::prometheus_gateway_client::PrometheusGatewayClient;
use api::v1::region::region_client::RegionClient as PbRegionClient; use api::v1::region::region_client::RegionClient as PbRegionClient;
use arrow_flight::flight_service_client::FlightServiceClient; use arrow_flight::flight_service_client::FlightServiceClient;
use common_grpc::channel_manager::{ use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption};
ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config,
};
use parking_lot::RwLock; use parking_lot::RwLock;
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
use tonic::codec::CompressionEncoding; use tonic::codec::CompressionEncoding;
@@ -96,9 +94,8 @@ impl Client {
A: AsRef<[U]>, A: AsRef<[U]>,
{ {
let channel_config = ChannelConfig::default().client_tls_config(client_tls); let channel_config = ChannelConfig::default().client_tls_config(client_tls);
let tls_config = load_tls_config(channel_config.client_tls.as_ref()) let channel_manager = ChannelManager::with_tls_config(channel_config)
.context(error::CreateTlsChannelSnafu)?; .context(error::CreateTlsChannelSnafu)?;
let channel_manager = ChannelManager::with_config(channel_config, tls_config);
Ok(Self::with_manager_and_urls(channel_manager, urls)) Ok(Self::with_manager_and_urls(channel_manager, urls))
} }

View File

@@ -74,7 +74,7 @@ impl FlownodeManager for NodeClients {
impl NodeClients { impl NodeClients {
pub fn new(config: ChannelConfig) -> Self { pub fn new(config: ChannelConfig) -> Self {
Self { Self {
channel_manager: ChannelManager::with_config(config, None), channel_manager: ChannelManager::with_config(config),
clients: CacheBuilder::new(1024) clients: CacheBuilder::new(1024)
.time_to_live(Duration::from_secs(30 * 60)) .time_to_live(Duration::from_secs(30 * 60))
.time_to_idle(Duration::from_secs(5 * 60)) .time_to_idle(Duration::from_secs(5 * 60))

View File

@@ -162,7 +162,6 @@ impl ObjbenchCommand {
file_size, file_size,
available_indexes: Default::default(), available_indexes: Default::default(),
index_file_size: 0, index_file_size: 0,
index_file_id: None,
num_rows, num_rows,
num_row_groups, num_row_groups,
sequence: None, sequence: None,

View File

@@ -15,7 +15,6 @@
use std::time::Duration; use std::time::Duration;
use cmd::options::GreptimeOptions; use cmd::options::GreptimeOptions;
use common_base::memory_limit::MemoryLimit;
use common_config::{Configurable, DEFAULT_DATA_HOME}; use common_config::{Configurable, DEFAULT_DATA_HOME};
use common_options::datanode::{ClientOptions, DatanodeClientOptions}; use common_options::datanode::{ClientOptions, DatanodeClientOptions};
use common_telemetry::logging::{DEFAULT_LOGGING_DIR, DEFAULT_OTLP_HTTP_ENDPOINT, LoggingOptions}; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, DEFAULT_OTLP_HTTP_ENDPOINT, LoggingOptions};
@@ -75,19 +74,14 @@ fn test_load_datanode_example_config() {
RegionEngineConfig::Mito(MitoConfig { RegionEngineConfig::Mito(MitoConfig {
auto_flush_interval: Duration::from_secs(3600), auto_flush_interval: Duration::from_secs(3600),
write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
scan_memory_limit: MemoryLimit::Percentage(50),
..Default::default() ..Default::default()
}), }),
RegionEngineConfig::File(FileEngineConfig {}), RegionEngineConfig::File(FileEngineConfig {}),
RegionEngineConfig::Metric(MetricEngineConfig { RegionEngineConfig::Metric(MetricEngineConfig {
sparse_primary_key_encoding: true, experimental_sparse_primary_key_encoding: false,
flush_metadata_region_interval: Duration::from_secs(30), flush_metadata_region_interval: Duration::from_secs(30),
}), }),
], ],
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
logging: LoggingOptions { logging: LoggingOptions {
level: Some("info".to_string()), level: Some("info".to_string()),
dir: format!("{}/{}", DEFAULT_DATA_HOME, DEFAULT_LOGGING_DIR), dir: format!("{}/{}", DEFAULT_DATA_HOME, DEFAULT_LOGGING_DIR),
@@ -161,10 +155,6 @@ fn test_load_frontend_example_config() {
cors_allowed_origins: vec!["https://example.com".to_string()], cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default() ..Default::default()
}, },
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
..Default::default() ..Default::default()
}, },
..Default::default() ..Default::default()
@@ -252,7 +242,6 @@ fn test_load_flownode_example_config() {
query: QueryOptions { query: QueryOptions {
parallelism: 1, parallelism: 1,
allow_query_fallback: false, allow_query_fallback: false,
memory_pool_size: MemoryLimit::Percentage(50),
}, },
meta_client: Some(MetaClientOptions { meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()], metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
@@ -297,12 +286,11 @@ fn test_load_standalone_example_config() {
RegionEngineConfig::Mito(MitoConfig { RegionEngineConfig::Mito(MitoConfig {
auto_flush_interval: Duration::from_secs(3600), auto_flush_interval: Duration::from_secs(3600),
write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
scan_memory_limit: MemoryLimit::Percentage(50),
..Default::default() ..Default::default()
}), }),
RegionEngineConfig::File(FileEngineConfig {}), RegionEngineConfig::File(FileEngineConfig {}),
RegionEngineConfig::Metric(MetricEngineConfig { RegionEngineConfig::Metric(MetricEngineConfig {
sparse_primary_key_encoding: true, experimental_sparse_primary_key_encoding: false,
flush_metadata_region_interval: Duration::from_secs(30), flush_metadata_region_interval: Duration::from_secs(30),
}), }),
], ],
@@ -326,10 +314,7 @@ fn test_load_standalone_example_config() {
cors_allowed_origins: vec!["https://example.com".to_string()], cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default() ..Default::default()
}, },
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
..Default::default() ..Default::default()
}, },
..Default::default() ..Default::default()

View File

@@ -15,7 +15,6 @@
pub mod bit_vec; pub mod bit_vec;
pub mod bytes; pub mod bytes;
pub mod cancellation; pub mod cancellation;
pub mod memory_limit;
pub mod plugins; pub mod plugins;
pub mod range_read; pub mod range_read;
#[allow(clippy::all)] #[allow(clippy::all)]

View File

@@ -1,265 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use std::str::FromStr;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::readable_size::ReadableSize;
/// Memory limit configuration that supports both absolute size and percentage.
///
/// Examples:
/// - Absolute size: "2GB", "4GiB", "512MB"
/// - Percentage: "50%", "75%"
/// - Unlimited: "unlimited", "0"
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum MemoryLimit {
/// Absolute memory size.
Size(ReadableSize),
/// Percentage of total system memory (0-100).
Percentage(u8),
/// No memory limit.
#[default]
Unlimited,
}
impl MemoryLimit {
/// Resolve the memory limit to bytes based on total system memory.
/// Returns 0 if the limit is unlimited.
pub fn resolve(&self, total_memory_bytes: u64) -> u64 {
match self {
MemoryLimit::Size(size) => size.as_bytes(),
MemoryLimit::Percentage(pct) => total_memory_bytes * (*pct as u64) / 100,
MemoryLimit::Unlimited => 0,
}
}
/// Returns true if this limit is unlimited.
pub fn is_unlimited(&self) -> bool {
match self {
MemoryLimit::Size(size) => size.as_bytes() == 0,
MemoryLimit::Percentage(pct) => *pct == 0,
MemoryLimit::Unlimited => true,
}
}
}
impl FromStr for MemoryLimit {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
if s.eq_ignore_ascii_case("unlimited") {
return Ok(MemoryLimit::Unlimited);
}
if let Some(pct_str) = s.strip_suffix('%') {
let pct = pct_str
.trim()
.parse::<u8>()
.map_err(|e| format!("invalid percentage value '{}': {}", pct_str, e))?;
if pct > 100 {
return Err(format!("percentage must be between 0 and 100, got {}", pct));
}
if pct == 0 {
Ok(MemoryLimit::Unlimited)
} else {
Ok(MemoryLimit::Percentage(pct))
}
} else {
let size = ReadableSize::from_str(s)?;
if size.as_bytes() == 0 {
Ok(MemoryLimit::Unlimited)
} else {
Ok(MemoryLimit::Size(size))
}
}
}
}
impl Display for MemoryLimit {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemoryLimit::Size(size) => write!(f, "{}", size),
MemoryLimit::Percentage(pct) => write!(f, "{}%", pct),
MemoryLimit::Unlimited => write!(f, "unlimited"),
}
}
}
impl Serialize for MemoryLimit {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(&self.to_string())
}
}
impl<'de> Deserialize<'de> for MemoryLimit {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
MemoryLimit::from_str(&s).map_err(serde::de::Error::custom)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_absolute_size() {
assert_eq!(
"2GB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024))
);
assert_eq!(
"512MB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Size(ReadableSize(512 * 1024 * 1024))
);
assert_eq!("0".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
}
#[test]
fn test_parse_percentage() {
assert_eq!(
"50%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(50)
);
assert_eq!(
"75%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(75)
);
assert_eq!("0%".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
}
#[test]
fn test_parse_invalid() {
assert!("150%".parse::<MemoryLimit>().is_err());
assert!("-10%".parse::<MemoryLimit>().is_err());
assert!("invalid".parse::<MemoryLimit>().is_err());
}
#[test]
fn test_resolve() {
let total = 8 * 1024 * 1024 * 1024; // 8GB
assert_eq!(
MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024)).resolve(total),
2 * 1024 * 1024 * 1024
);
assert_eq!(
MemoryLimit::Percentage(50).resolve(total),
4 * 1024 * 1024 * 1024
);
assert_eq!(MemoryLimit::Unlimited.resolve(total), 0);
}
#[test]
fn test_is_unlimited() {
assert!(MemoryLimit::Unlimited.is_unlimited());
assert!(!MemoryLimit::Size(ReadableSize(1024)).is_unlimited());
assert!(!MemoryLimit::Percentage(50).is_unlimited());
assert!(!MemoryLimit::Percentage(1).is_unlimited());
// Defensive: these states shouldn't exist via public API, but check anyway
assert!(MemoryLimit::Size(ReadableSize(0)).is_unlimited());
assert!(MemoryLimit::Percentage(0).is_unlimited());
}
#[test]
fn test_parse_100_percent() {
assert_eq!(
"100%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(100)
);
}
#[test]
fn test_display_percentage() {
assert_eq!(MemoryLimit::Percentage(20).to_string(), "20%");
assert_eq!(MemoryLimit::Percentage(50).to_string(), "50%");
assert_eq!(MemoryLimit::Percentage(100).to_string(), "100%");
}
#[test]
fn test_parse_unlimited() {
assert_eq!(
"unlimited".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
assert_eq!(
"UNLIMITED".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
assert_eq!(
"Unlimited".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
}
#[test]
fn test_display_unlimited() {
assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited");
}
#[test]
fn test_parse_display_roundtrip() {
let cases = vec![
"50%",
"100%",
"1%",
"2GB",
"512MB",
"unlimited",
"UNLIMITED",
"0", // normalized to unlimited
"0%", // normalized to unlimited
];
for input in cases {
let parsed = input.parse::<MemoryLimit>().unwrap();
let displayed = parsed.to_string();
let reparsed = displayed.parse::<MemoryLimit>().unwrap();
assert_eq!(
parsed, reparsed,
"round-trip failed: '{}' -> '{}' -> '{:?}'",
input, displayed, reparsed
);
}
}
#[test]
fn test_zero_normalization() {
// All forms of zero should normalize to Unlimited
assert_eq!("0".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!("0%".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!("0B".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!(
"0KB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
// Unlimited always displays as "unlimited"
assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited");
}
}
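For context on the `memory_pool_size` and `scan_memory_limit` settings elsewhere in this diff, the `MemoryLimit` type above resolves a configured string against total system memory: an absolute size passes through, a percentage is scaled, and zero or "unlimited" resolves to 0 (meaning no limit). Below is a condensed, self-contained stand-in for that resolution; the real type also parses human-readable sizes via `ReadableSize`.

```rust
/// Condensed stand-in for `MemoryLimit::resolve`, for illustration only.
#[derive(Debug, PartialEq)]
enum Limit {
    Size(u64),      // absolute bytes, e.g. parsed from "2GB"
    Percentage(u8), // e.g. parsed from "50%"
    Unlimited,      // "unlimited", "0", or "0%"
}

fn resolve(limit: &Limit, total_memory_bytes: u64) -> u64 {
    match limit {
        Limit::Size(bytes) => *bytes,
        Limit::Percentage(pct) => total_memory_bytes * (*pct as u64) / 100,
        // Callers treat 0 as "no limit", matching the implementation above.
        Limit::Unlimited => 0,
    }
}

fn main() {
    let total = 8u64 << 30; // assume 8 GiB of system memory
    assert_eq!(resolve(&Limit::Percentage(50), total), 4 << 30); // "50%" -> 4 GiB
    assert_eq!(resolve(&Limit::Size(2 << 30), total), 2 << 30); // "2GB" -> 2 GiB
    assert_eq!(resolve(&Limit::Unlimited, total), 0); // "unlimited" -> no limit
    println!("ok");
}
```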

View File

@@ -45,19 +45,3 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap {
header.insert(GREPTIME_DB_HEADER_ERROR_MSG, msg); header.insert(GREPTIME_DB_HEADER_ERROR_MSG, msg);
header header
} }
/// Returns the external root cause of the source error (exclude the current error).
pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> {
// There are some divergence about the behavior of the `sources()` API
// in https://github.com/rust-lang/rust/issues/58520
// So this function iterates the sources manually.
let mut root = err.source();
while let Some(r) = root {
if let Some(s) = r.source() {
root = Some(s);
} else {
break;
}
}
root
}
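A quick illustration of what the removed `root_source` helper returned: it walks `Error::source()` to the innermost error and reports that one, excluding the error it started from. The sketch below uses two hand-rolled wrapper errors so it runs standalone; the loop itself mirrors the deleted function.

```rust
use std::error::Error;
use std::fmt;

#[derive(Debug)]
struct Outer(Inner);
#[derive(Debug)]
struct Inner(std::io::Error);

impl fmt::Display for Outer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "outer error")
    }
}
impl fmt::Display for Inner {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "inner error")
    }
}
impl Error for Outer {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}
impl Error for Inner {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}

/// Same walk as the removed helper: follow `source()` to the deepest error.
fn root_source(err: &dyn Error) -> Option<&dyn Error> {
    let mut root = err.source();
    while let Some(r) = root {
        match r.source() {
            Some(s) => root = Some(s),
            None => break,
        }
    }
    root
}

fn main() {
    let io = std::io::Error::new(std::io::ErrorKind::Other, "disk full");
    let err = Outer(Inner(io));
    // The root cause is the io::Error, not `Outer` itself or `Inner`.
    println!("{}", root_source(&err).unwrap()); // prints "disk full"
}
```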

View File

@@ -104,7 +104,7 @@ impl MetaClientSelector {
let cfg = ChannelConfig::new() let cfg = ChannelConfig::new()
.connect_timeout(Duration::from_secs(30)) .connect_timeout(Duration::from_secs(30))
.timeout(Duration::from_secs(30)); .timeout(Duration::from_secs(30));
let channel_manager = ChannelManager::with_config(cfg, None); let channel_manager = ChannelManager::with_config(cfg);
Self { Self {
meta_client, meta_client,
channel_manager, channel_manager,

View File

@@ -12,12 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use crate::aggrs::vector::avg::VectorAvg;
use crate::aggrs::vector::product::VectorProduct; use crate::aggrs::vector::product::VectorProduct;
use crate::aggrs::vector::sum::VectorSum; use crate::aggrs::vector::sum::VectorSum;
use crate::function_registry::FunctionRegistry; use crate::function_registry::FunctionRegistry;
mod avg;
mod product; mod product;
mod sum; mod sum;
@@ -27,6 +25,5 @@ impl VectorFunction {
pub fn register(registry: &FunctionRegistry) { pub fn register(registry: &FunctionRegistry) {
registry.register_aggr(VectorSum::uadf_impl()); registry.register_aggr(VectorSum::uadf_impl());
registry.register_aggr(VectorProduct::uadf_impl()); registry.register_aggr(VectorProduct::uadf_impl());
registry.register_aggr(VectorAvg::uadf_impl());
} }
} }

View File

@@ -1,270 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, AsArray, BinaryArray, LargeStringArray, StringArray};
use arrow::compute::sum;
use arrow::datatypes::UInt64Type;
use arrow_schema::{DataType, Field};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{
Accumulator, AggregateUDF, Signature, SimpleAggregateUDF, TypeSignature, Volatility,
};
use datafusion_functions_aggregate_common::accumulator::AccumulatorArgs;
use nalgebra::{Const, DVector, DVectorView, Dyn, OVector};
use crate::scalars::vector::impl_conv::{
binlit_as_veclit, parse_veclit_from_strlit, veclit_to_binlit,
};
/// The accumulator for the `vec_avg` aggregate function.
#[derive(Debug, Default)]
pub struct VectorAvg {
sum: Option<OVector<f32, Dyn>>,
count: u64,
}
impl VectorAvg {
/// Create a new `AggregateUDF` for the `vec_avg` aggregate function.
pub fn uadf_impl() -> AggregateUDF {
let signature = Signature::one_of(
vec![
TypeSignature::Exact(vec![DataType::Utf8]),
TypeSignature::Exact(vec![DataType::LargeUtf8]),
TypeSignature::Exact(vec![DataType::Binary]),
],
Volatility::Immutable,
);
let udaf = SimpleAggregateUDF::new_with_signature(
"vec_avg",
signature,
DataType::Binary,
Arc::new(Self::accumulator),
vec![
Arc::new(Field::new("sum", DataType::Binary, true)),
Arc::new(Field::new("count", DataType::UInt64, true)),
],
);
AggregateUDF::from(udaf)
}
fn accumulator(args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
if args.schema.fields().len() != 1 {
return Err(datafusion_common::DataFusionError::Internal(format!(
"expect creating `VEC_AVG` with only one input field, actual {}",
args.schema.fields().len()
)));
}
let t = args.schema.field(0).data_type();
if !matches!(t, DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary) {
return Err(datafusion_common::DataFusionError::Internal(format!(
"unexpected input datatype {t} when creating `VEC_AVG`"
)));
}
Ok(Box::new(VectorAvg::default()))
}
fn inner(&mut self, len: usize) -> &mut OVector<f32, Dyn> {
self.sum
.get_or_insert_with(|| OVector::zeros_generic(Dyn(len), Const::<1>))
}
fn update(&mut self, values: &[ArrayRef], is_update: bool) -> Result<()> {
if values.is_empty() {
return Ok(());
};
let vectors = match values[0].data_type() {
DataType::Utf8 => {
let arr: &StringArray = values[0].as_string();
arr.iter()
.filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into)))
.map(|x| x.map(Cow::Owned))
.collect::<Result<Vec<_>>>()?
}
DataType::LargeUtf8 => {
let arr: &LargeStringArray = values[0].as_string();
arr.iter()
.filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into)))
.map(|x: Result<Vec<f32>>| x.map(Cow::Owned))
.collect::<Result<Vec<_>>>()?
}
DataType::Binary => {
let arr: &BinaryArray = values[0].as_binary();
arr.iter()
.filter_map(|x| x.map(|b| binlit_as_veclit(b).map_err(Into::into)))
.collect::<Result<Vec<_>>>()?
}
_ => {
return Err(datafusion_common::DataFusionError::NotImplemented(format!(
"unsupported data type {} for `VEC_AVG`",
values[0].data_type()
)));
}
};
if vectors.is_empty() {
return Ok(());
}
let len = if is_update {
vectors.len() as u64
} else {
sum(values[1].as_primitive::<UInt64Type>()).unwrap_or_default()
};
let dims = vectors[0].len();
let mut sum = DVector::zeros(dims);
for v in vectors {
if v.len() != dims {
return Err(datafusion_common::DataFusionError::Execution(
"vectors length not match: VEC_AVG".to_string(),
));
}
let v_view = DVectorView::from_slice(&v, dims);
sum += &v_view;
}
*self.inner(dims) += sum;
self.count += len;
Ok(())
}
}
impl Accumulator for VectorAvg {
fn state(&mut self) -> Result<Vec<ScalarValue>> {
let vector = match &self.sum {
None => ScalarValue::Binary(None),
Some(sum) => ScalarValue::Binary(Some(veclit_to_binlit(sum.as_slice()))),
};
Ok(vec![vector, ScalarValue::from(self.count)])
}
fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
self.update(values, true)
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
self.update(states, false)
}
fn evaluate(&mut self) -> Result<ScalarValue> {
match &self.sum {
None => Ok(ScalarValue::Binary(None)),
Some(sum) => Ok(ScalarValue::Binary(Some(veclit_to_binlit(
(sum / self.count as f32).as_slice(),
)))),
}
}
fn size(&self) -> usize {
size_of_val(self)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::StringArray;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{ConstantVector, StringVector, Vector};
use super::*;
#[test]
fn test_update_batch() {
// test update empty batch, expect not updating anything
let mut vec_avg = VectorAvg::default();
vec_avg.update_batch(&[]).unwrap();
assert!(vec_avg.sum.is_none());
assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap());
// test update one not-null value
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[2.5, 3.5, 4.5]))),
vec_avg.evaluate().unwrap()
);
// test update one null value
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![Option::<String>::None]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap());
// test update no null-value batch
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))),
vec_avg.evaluate().unwrap()
);
// test update null-value batch
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
None,
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))),
vec_avg.evaluate().unwrap()
);
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
None,
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[5.5, 6.5, 7.5]))),
vec_avg.evaluate().unwrap()
);
// test update with constant vector
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![
Arc::new(ConstantVector::new(
Arc::new(StringVector::from_vec(vec!["[1.0,2.0,3.0]".to_string()])),
4,
))
.to_arrow_array(),
];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[1.0, 2.0, 3.0]))),
vec_avg.evaluate().unwrap()
);
}
}

View File

@@ -14,7 +14,6 @@
mod convert; mod convert;
mod distance; mod distance;
mod elem_avg;
mod elem_product; mod elem_product;
mod elem_sum; mod elem_sum;
pub mod impl_conv; pub mod impl_conv;
@@ -65,7 +64,6 @@ impl VectorFunction {
registry.register_scalar(vector_subvector::VectorSubvectorFunction::default()); registry.register_scalar(vector_subvector::VectorSubvectorFunction::default());
registry.register_scalar(elem_sum::ElemSumFunction::default()); registry.register_scalar(elem_sum::ElemSumFunction::default());
registry.register_scalar(elem_product::ElemProductFunction::default()); registry.register_scalar(elem_product::ElemProductFunction::default());
registry.register_scalar(elem_avg::ElemAvgFunction::default());
} }
} }

View File

@@ -1,128 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::Display;
use datafusion::arrow::datatypes::DataType;
use datafusion::logical_expr::ColumnarValue;
use datafusion_common::ScalarValue;
use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS};
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use nalgebra::DVectorView;
use crate::function::Function;
use crate::scalars::vector::{VectorCalculator, impl_conv};
const NAME: &str = "vec_elem_avg";
#[derive(Debug, Clone)]
pub(crate) struct ElemAvgFunction {
signature: Signature,
}
impl Default for ElemAvgFunction {
fn default() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::Uniform(1, STRINGS.to_vec()),
TypeSignature::Uniform(1, BINARYS.to_vec()),
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
],
Volatility::Immutable,
),
}
}
}
impl Function for ElemAvgFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Float32)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let body = |v0: &ScalarValue| -> datafusion_common::Result<ScalarValue> {
let v0 =
impl_conv::as_veclit(v0)?.map(|v0| DVectorView::from_slice(&v0, v0.len()).mean());
Ok(ScalarValue::Float32(v0))
};
let calculator = VectorCalculator {
name: self.name(),
func: body,
};
calculator.invoke_with_single_argument(args)
}
}
impl Display for ElemAvgFunction {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::StringViewArray;
use arrow_schema::Field;
use datafusion::arrow::array::{Array, AsArray};
use datafusion::arrow::datatypes::Float32Type;
use datafusion_common::config::ConfigOptions;
use super::*;
#[test]
fn test_elem_avg() {
let func = ElemAvgFunction::default();
let input = Arc::new(StringViewArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
None,
]));
let result = func
.invoke_with_args(ScalarFunctionArgs {
args: vec![ColumnarValue::Array(input.clone())],
arg_fields: vec![],
number_rows: input.len(),
return_field: Arc::new(Field::new("x", DataType::Float32, true)),
config_options: Arc::new(ConfigOptions::new()),
})
.and_then(|v| ColumnarValue::values_to_arrays(&[v]))
.map(|mut a| a.remove(0))
.unwrap();
let result = result.as_primitive::<Float32Type>();
assert_eq!(result.len(), 4);
assert_eq!(result.value(0), 2.0);
assert_eq!(result.value(1), 5.0);
assert_eq!(result.value(2), 8.0);
assert!(result.is_null(3));
}
}
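The two removed functions compute different averages: `vec_avg` (the aggregate deleted further above) averages many vectors element-wise into a single vector, while `vec_elem_avg` (this file) averages the elements inside one vector into a single float. A plain-`Vec<f32>` sketch of the two semantics, for illustration only; the real implementations operate on Arrow arrays.

```rust
/// Element-wise average across rows: the `vec_avg` aggregate semantics.
fn vec_avg(rows: &[Vec<f32>]) -> Option<Vec<f32>> {
    let first = rows.first()?;
    let dims = first.len();
    let mut sum = vec![0.0f32; dims];
    for row in rows {
        assert_eq!(row.len(), dims, "vectors must have the same dimension");
        for (s, v) in sum.iter_mut().zip(row) {
            *s += v;
        }
    }
    let n = rows.len() as f32;
    Some(sum.into_iter().map(|s| s / n).collect())
}

/// Average of the elements inside one vector: the `vec_elem_avg` scalar semantics.
fn vec_elem_avg(v: &[f32]) -> Option<f32> {
    if v.is_empty() {
        return None;
    }
    Some(v.iter().sum::<f32>() / v.len() as f32)
}

fn main() {
    let rows = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]];
    assert_eq!(vec_avg(&rows), Some(vec![2.5, 3.5, 4.5])); // matches the test above
    assert_eq!(vec_elem_avg(&[1.0, 2.0, 3.0]), Some(2.0));
    println!("ok");
}
```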

View File

@@ -22,14 +22,14 @@ use dashmap::DashMap;
use dashmap::mapref::entry::Entry; use dashmap::mapref::entry::Entry;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::ResultExt; use snafu::{OptionExt, ResultExt};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tonic::transport::{ use tonic::transport::{
Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri, Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri,
}; };
use tower::Service; use tower::Service;
use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, Result}; use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, InvalidTlsConfigSnafu, Result};
const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60; const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60;
pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10; pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10;
@@ -91,18 +91,57 @@ impl ChannelManager {
Default::default() Default::default()
} }
/// unified with-config function that supports a tls config pub fn with_config(config: ChannelConfig) -> Self {
/// use [`load_tls_config`] to load the tls config from the file system let inner = Inner::with_config(config);
pub fn with_config(config: ChannelConfig, tls_config: Option<ClientTlsConfig>) -> Self {
let mut inner = Inner::with_config(config.clone());
if let Some(tls_config) = tls_config {
inner.client_tls_config = Some(tls_config);
}
Self { Self {
inner: Arc::new(inner), inner: Arc::new(inner),
} }
} }
/// Read tls cert and key files and create a ChannelManager with TLS config.
pub fn with_tls_config(config: ChannelConfig) -> Result<Self> {
let mut inner = Inner::with_config(config.clone());
// setup tls
let path_config = config.client_tls.context(InvalidTlsConfigSnafu {
msg: "no config input",
})?;
if !path_config.enabled {
// If TLS is not enabled, ignore the rest of the TLS config
// and leave `client_tls_config` unset, so TLS is not used.
return Ok(Self {
inner: Arc::new(inner),
});
}
let mut tls_config = ClientTlsConfig::new();
if let Some(server_ca) = path_config.server_ca_cert_path {
let server_root_ca_cert =
std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?;
let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert);
tls_config = tls_config.ca_certificate(server_root_ca_cert);
}
if let (Some(client_cert_path), Some(client_key_path)) =
(&path_config.client_cert_path, &path_config.client_key_path)
{
let client_cert =
std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?;
let client_key =
std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
tls_config = tls_config.identity(client_identity);
}
inner.client_tls_config = Some(tls_config);
Ok(Self {
inner: Arc::new(inner),
})
}
pub fn config(&self) -> &ChannelConfig { pub fn config(&self) -> &ChannelConfig {
&self.inner.config &self.inner.config
} }
@@ -248,34 +287,6 @@ impl ChannelManager {
} }
} }
pub fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result<Option<ClientTlsConfig>> {
let path_config = match tls_option {
Some(path_config) if path_config.enabled => path_config,
_ => return Ok(None),
};
let mut tls_config = ClientTlsConfig::new();
if let Some(server_ca) = &path_config.server_ca_cert_path {
let server_root_ca_cert =
std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?;
let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert);
tls_config = tls_config.ca_certificate(server_root_ca_cert);
}
if let (Some(client_cert_path), Some(client_key_path)) =
(&path_config.client_cert_path, &path_config.client_key_path)
{
let client_cert =
std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?;
let client_key =
std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
tls_config = tls_config.identity(client_identity);
}
Ok(Some(tls_config))
}
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ClientTlsOption { pub struct ClientTlsOption {
/// Whether to enable TLS for client. /// Whether to enable TLS for client.
@@ -648,7 +659,7 @@ mod tests {
.http2_adaptive_window(true) .http2_adaptive_window(true)
.tcp_keepalive(Duration::from_secs(2)) .tcp_keepalive(Duration::from_secs(2))
.tcp_nodelay(true); .tcp_nodelay(true);
let mgr = ChannelManager::with_config(config, None); let mgr = ChannelManager::with_config(config);
let res = mgr.build_endpoint("test_addr"); let res = mgr.build_endpoint("test_addr");

View File

@@ -12,17 +12,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use common_grpc::channel_manager::{ use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption};
ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config,
};
#[tokio::test] #[tokio::test]
async fn test_mtls_config() { async fn test_mtls_config() {
// test no config // test no config
let config = ChannelConfig::new(); let config = ChannelConfig::new();
let re = load_tls_config(config.client_tls.as_ref()); let re = ChannelManager::with_tls_config(config);
assert!(re.is_ok()); assert!(re.is_err());
assert!(re.unwrap().is_none());
// test wrong file // test wrong file
let config = ChannelConfig::new().client_tls_config(ClientTlsOption { let config = ChannelConfig::new().client_tls_config(ClientTlsOption {
@@ -32,7 +29,7 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/wrong_client.key".to_string()), client_key_path: Some("tests/tls/wrong_client.key".to_string()),
}); });
let re = load_tls_config(config.client_tls.as_ref()); let re = ChannelManager::with_tls_config(config);
assert!(re.is_err()); assert!(re.is_err());
// test corrupted file content // test corrupted file content
@@ -43,9 +40,7 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/corrupted".to_string()), client_key_path: Some("tests/tls/corrupted".to_string()),
}); });
let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); let re = ChannelManager::with_tls_config(config).unwrap();
let re = ChannelManager::with_config(config, tls_config);
let re = re.get("127.0.0.1:0"); let re = re.get("127.0.0.1:0");
assert!(re.is_err()); assert!(re.is_err());
@@ -57,8 +52,7 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/client.key".to_string()), client_key_path: Some("tests/tls/client.key".to_string()),
}); });
let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); let re = ChannelManager::with_tls_config(config).unwrap();
let re = ChannelManager::with_config(config, tls_config);
let re = re.get("127.0.0.1:0"); let re = re.get("127.0.0.1:0");
let _ = re.unwrap(); let _ = re.unwrap();
} }

View File

@@ -77,10 +77,7 @@ serde_json.workspace = true
serde_with.workspace = true serde_with.workspace = true
session.workspace = true session.workspace = true
snafu.workspace = true snafu.workspace = true
sqlx = { workspace = true, features = [ sqlx = { workspace = true, optional = true }
"mysql",
"chrono",
], optional = true }
store-api.workspace = true store-api.workspace = true
strum.workspace = true strum.workspace = true
table = { workspace = true, features = ["testing"] } table = { workspace = true, features = ["testing"] }

View File

@@ -442,7 +442,7 @@ pub fn extract_column_metadatas(
results: &mut [RegionResponse], results: &mut [RegionResponse],
key: &str, key: &str,
) -> Result<Option<Vec<ColumnMetadata>>> { ) -> Result<Option<Vec<ColumnMetadata>>> {
let mut schemas = results let schemas = results
.iter_mut() .iter_mut()
.map(|r| r.extensions.remove(key)) .map(|r| r.extensions.remove(key))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@@ -454,24 +454,20 @@ pub fn extract_column_metadatas(
// Verify all the physical schemas are the same // Verify all the physical schemas are the same
// Safety: previous check ensures this vec is not empty // Safety: previous check ensures this vec is not empty
let first_column_metadatas = schemas let first = schemas.first().unwrap();
.swap_remove(0) ensure!(
.map(|first_bytes| ColumnMetadata::decode_list(&first_bytes).context(DecodeJsonSnafu)) schemas.iter().all(|x| x == first),
.transpose()?; MetadataCorruptionSnafu {
err_msg: "The table column metadata schemas from datanodes are not the same."
}
);
for s in schemas { if let Some(first) = first {
// check decoded column metadata instead of bytes because it contains the extension map. let column_metadatas = ColumnMetadata::decode_list(first).context(DecodeJsonSnafu)?;
let column_metadata = s Ok(Some(column_metadatas))
.map(|bytes| ColumnMetadata::decode_list(&bytes).context(DecodeJsonSnafu)) } else {
.transpose()?; Ok(None)
ensure!(
column_metadata == first_column_metadatas,
MetadataCorruptionSnafu {
err_msg: "The table column metadata schemas from datanodes are not the same."
}
);
} }
Ok(first_column_metadatas)
} }
#[cfg(test)] #[cfg(test)]
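The removed branch compares decoded `ColumnMetadata` values rather than the raw extension bytes. One plausible motivation, sketched here purely for illustration (the keys below are made up and `serde_json` is assumed as a dependency): two equal maps can serialize to different byte sequences when key order differs, so decoding before comparing avoids false mismatches.

```rust
use std::collections::HashMap;

fn main() {
    // Same logical metadata, different key order in the encoded bytes.
    let a = r#"{"extension":"1","comment":"c"}"#;
    let b = r#"{"comment":"c","extension":"1"}"#;

    // Byte comparison says they differ...
    assert_ne!(a.as_bytes(), b.as_bytes());

    // ...but the decoded values are equal.
    let da: HashMap<String, String> = serde_json::from_str(a).unwrap();
    let db: HashMap<String, String> = serde_json::from_str(b).unwrap();
    assert_eq!(da, db);
    println!("decoded values are equal");
}
```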

View File

@@ -250,7 +250,7 @@ pub struct UpgradeRegion {
/// `None` stands for no wait, /// `None` stands for no wait,
/// it's helpful to verify whether the leader region is ready. /// it's helpful to verify whether the leader region is ready.
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub replay_timeout: Duration, pub replay_timeout: Option<Duration>,
/// The hint for replaying memtable. /// The hint for replaying memtable.
#[serde(default)] #[serde(default)]
pub location_id: Option<u64>, pub location_id: Option<u64>,
@@ -507,14 +507,13 @@ pub enum Instruction {
/// Closes regions. /// Closes regions.
#[serde(deserialize_with = "single_or_multiple_from", alias = "CloseRegion")] #[serde(deserialize_with = "single_or_multiple_from", alias = "CloseRegion")]
CloseRegions(Vec<RegionIdent>), CloseRegions(Vec<RegionIdent>),
/// Upgrades regions. /// Upgrades a region.
#[serde(deserialize_with = "single_or_multiple_from", alias = "UpgradeRegion")] UpgradeRegion(UpgradeRegion),
UpgradeRegions(Vec<UpgradeRegion>),
#[serde( #[serde(
deserialize_with = "single_or_multiple_from", deserialize_with = "single_or_multiple_from",
alias = "DowngradeRegion" alias = "DowngradeRegion"
)] )]
/// Downgrades regions. /// Downgrades a region.
DowngradeRegions(Vec<DowngradeRegion>), DowngradeRegions(Vec<DowngradeRegion>),
/// Invalidates batch cache. /// Invalidates batch cache.
InvalidateCaches(Vec<CacheIdent>), InvalidateCaches(Vec<CacheIdent>),
@@ -560,9 +559,9 @@ impl Instruction {
} }
/// Converts the instruction into a [UpgradeRegion]. /// Converts the instruction into a [UpgradeRegion].
pub fn into_upgrade_regions(self) -> Option<Vec<UpgradeRegion>> { pub fn into_upgrade_regions(self) -> Option<UpgradeRegion> {
match self { match self {
Self::UpgradeRegions(upgrade_region) => Some(upgrade_region), Self::UpgradeRegion(upgrade_region) => Some(upgrade_region),
_ => None, _ => None,
} }
} }
@@ -585,10 +584,6 @@ impl Instruction {
/// The reply of [UpgradeRegion]. /// The reply of [UpgradeRegion].
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionReply { pub struct UpgradeRegionReply {
/// The [RegionId].
/// For compatibility, it is defaulted to [RegionId::new(0, 0)].
#[serde(default)]
pub region_id: RegionId,
/// Returns true if `last_entry_id` has been replayed to the latest. /// Returns true if `last_entry_id` has been replayed to the latest.
pub ready: bool, pub ready: bool,
/// Indicates whether the region exists. /// Indicates whether the region exists.
@@ -640,39 +635,6 @@ where
}) })
} }
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionsReply {
pub replies: Vec<UpgradeRegionReply>,
}
impl UpgradeRegionsReply {
pub fn new(replies: Vec<UpgradeRegionReply>) -> Self {
Self { replies }
}
pub fn single(reply: UpgradeRegionReply) -> Self {
Self::new(vec![reply])
}
}
#[derive(Deserialize)]
#[serde(untagged)]
enum UpgradeRegionsCompat {
Single(UpgradeRegionReply),
Multiple(UpgradeRegionsReply),
}
fn upgrade_regions_compat_from<'de, D>(deserializer: D) -> Result<UpgradeRegionsReply, D::Error>
where
D: Deserializer<'de>,
{
let helper = UpgradeRegionsCompat::deserialize(deserializer)?;
Ok(match helper {
UpgradeRegionsCompat::Single(x) => UpgradeRegionsReply::new(vec![x]),
UpgradeRegionsCompat::Multiple(reply) => reply,
})
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(tag = "type", rename_all = "snake_case")] #[serde(tag = "type", rename_all = "snake_case")]
pub enum InstructionReply { pub enum InstructionReply {
@@ -680,11 +642,7 @@ pub enum InstructionReply {
OpenRegions(SimpleReply), OpenRegions(SimpleReply),
#[serde(alias = "close_region")] #[serde(alias = "close_region")]
CloseRegions(SimpleReply), CloseRegions(SimpleReply),
#[serde( UpgradeRegion(UpgradeRegionReply),
deserialize_with = "upgrade_regions_compat_from",
alias = "upgrade_region"
)]
UpgradeRegions(UpgradeRegionsReply),
#[serde( #[serde(
alias = "downgrade_region", alias = "downgrade_region",
deserialize_with = "downgrade_regions_compat_from" deserialize_with = "downgrade_regions_compat_from"
@@ -700,11 +658,9 @@ impl Display for InstructionReply {
match self { match self {
Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply), Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply),
Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply), Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply),
Self::UpgradeRegions(reply) => { Self::UpgradeRegion(reply) => write!(f, "InstructionReply::UpgradeRegion({})", reply),
write!(f, "InstructionReply::UpgradeRegions({:?})", reply.replies)
}
Self::DowngradeRegions(reply) => { Self::DowngradeRegions(reply) => {
write!(f, "InstructionReply::DowngradeRegions({:?})", reply.replies) write!(f, "InstructionReply::DowngradeRegions({:?})", reply)
} }
Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply), Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply),
Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply), Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply),
@@ -729,9 +685,9 @@ impl InstructionReply {
} }
} }
pub fn expect_upgrade_regions_reply(self) -> Vec<UpgradeRegionReply> { pub fn expect_upgrade_region_reply(self) -> UpgradeRegionReply {
match self { match self {
Self::UpgradeRegions(reply) => reply.replies, Self::UpgradeRegion(reply) => reply,
_ => panic!("Expected UpgradeRegion reply"), _ => panic!("Expected UpgradeRegion reply"),
} }
} }
@@ -793,58 +749,25 @@ mod tests {
serialized serialized
); );
let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion { let downgrade_region = InstructionReply::DowngradeRegions(DowngradeRegionsReply::single(
region_id: RegionId::new(1024, 1), DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
replay_timeout: Duration::from_millis(1000),
location_id: None,
replay_entry_id: None,
metadata_replay_entry_id: None,
}]);
let serialized = serde_json::to_string(&upgrade_region).unwrap();
assert_eq!(
r#"{"UpgradeRegions":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null}]}"#,
serialized
);
}
#[test]
fn test_serialize_instruction_reply() {
let downgrade_region_reply = InstructionReply::DowngradeRegions(
DowngradeRegionsReply::single(DowngradeRegionReply {
region_id: RegionId::new(1024, 1), region_id: RegionId::new(1024, 1),
last_entry_id: None, last_entry_id: None,
metadata_last_entry_id: None, metadata_last_entry_id: None,
exists: true, exists: true,
error: None, error: None,
}), },
); ));
let serialized = serde_json::to_string(&downgrade_region_reply).unwrap(); let serialized = serde_json::to_string(&downgrade_region).unwrap();
assert_eq!( assert_eq!(
r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#, r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#,
serialized serialized
); )
let upgrade_region_reply =
InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply {
region_id: RegionId::new(1024, 1),
ready: true,
exists: true,
error: None,
}));
let serialized = serde_json::to_string(&upgrade_region_reply).unwrap();
assert_eq!(
r#"{"type":"upgrade_regions","replies":[{"region_id":4398046511105,"ready":true,"exists":true,"error":null}]}"#,
serialized
);
} }
#[test] #[test]
fn test_deserialize_instruction() { fn test_deserialize_instruction() {
// legacy open region instruction
let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#; let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#;
let open_region_instruction: Instruction = let open_region_instruction: Instruction =
serde_json::from_str(open_region_instruction).unwrap(); serde_json::from_str(open_region_instruction).unwrap();
@@ -862,7 +785,6 @@ mod tests {
)]); )]);
assert_eq!(open_region_instruction, open_region); assert_eq!(open_region_instruction, open_region);
// legacy close region instruction
let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#; let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#;
let close_region_instruction: Instruction = let close_region_instruction: Instruction =
serde_json::from_str(close_region_instruction).unwrap(); serde_json::from_str(close_region_instruction).unwrap();
@@ -874,7 +796,6 @@ mod tests {
}]); }]);
assert_eq!(close_region_instruction, close_region); assert_eq!(close_region_instruction, close_region);
// legacy downgrade region instruction
let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#; let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#;
let downgrade_region_instruction: Instruction = let downgrade_region_instruction: Instruction =
serde_json::from_str(downgrade_region_instruction).unwrap(); serde_json::from_str(downgrade_region_instruction).unwrap();
@@ -884,25 +805,6 @@ mod tests {
}]); }]);
assert_eq!(downgrade_region_instruction, downgrade_region); assert_eq!(downgrade_region_instruction, downgrade_region);
// legacy upgrade region instruction
let upgrade_region_instruction = r#"{"UpgradeRegion":{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null,"replay_entry_id":null,"metadata_replay_entry_id":null}}"#;
let upgrade_region_instruction: Instruction =
serde_json::from_str(upgrade_region_instruction).unwrap();
let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion {
region_id: RegionId::new(1024, 1),
last_entry_id: None,
metadata_last_entry_id: None,
replay_timeout: Duration::from_millis(1000),
location_id: None,
replay_entry_id: None,
metadata_replay_entry_id: None,
}]);
assert_eq!(upgrade_region_instruction, upgrade_region);
}
#[test]
fn test_deserialize_instruction_reply() {
// legacy close region reply
let close_region_instruction_reply = let close_region_instruction_reply =
r#"{"result":true,"error":null,"type":"close_region"}"#; r#"{"result":true,"error":null,"type":"close_region"}"#;
let close_region_instruction_reply: InstructionReply = let close_region_instruction_reply: InstructionReply =
@@ -913,7 +815,6 @@ mod tests {
}); });
assert_eq!(close_region_instruction_reply, close_region_reply); assert_eq!(close_region_instruction_reply, close_region_reply);
// legacy open region reply
let open_region_instruction_reply = r#"{"result":true,"error":null,"type":"open_region"}"#; let open_region_instruction_reply = r#"{"result":true,"error":null,"type":"open_region"}"#;
let open_region_instruction_reply: InstructionReply = let open_region_instruction_reply: InstructionReply =
serde_json::from_str(open_region_instruction_reply).unwrap(); serde_json::from_str(open_region_instruction_reply).unwrap();
@@ -923,7 +824,6 @@ mod tests {
}); });
assert_eq!(open_region_instruction_reply, open_region_reply); assert_eq!(open_region_instruction_reply, open_region_reply);
// legacy downgrade region reply
let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#; let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#;
let downgrade_region_instruction_reply: InstructionReply = let downgrade_region_instruction_reply: InstructionReply =
serde_json::from_str(downgrade_region_instruction_reply).unwrap(); serde_json::from_str(downgrade_region_instruction_reply).unwrap();
@@ -937,19 +837,6 @@ mod tests {
}), }),
); );
assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply); assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply);
// legacy upgrade region reply
let upgrade_region_instruction_reply = r#"{"region_id":4398046511105,"ready":true,"exists":true,"error":null,"type":"upgrade_region"}"#;
let upgrade_region_instruction_reply: InstructionReply =
serde_json::from_str(upgrade_region_instruction_reply).unwrap();
let upgrade_region_reply =
InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply {
region_id: RegionId::new(1024, 1),
ready: true,
exists: true,
error: None,
}));
assert_eq!(upgrade_region_instruction_reply, upgrade_region_reply);
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -26,6 +26,7 @@ use datatypes::arrow::datatypes::{
Int32Type, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, Int32Type, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType, TimestampSecondType,
}; };
use datatypes::schema::SchemaRef;
fn prepare_record_batch(rows: usize) -> RecordBatch { fn prepare_record_batch(rows: usize) -> RecordBatch {
let schema = Schema::new(vec![ let schema = Schema::new(vec![
@@ -55,6 +56,14 @@ fn prepare_record_batch(rows: usize) -> RecordBatch {
RecordBatch::try_new(Arc::new(schema), columns).unwrap() RecordBatch::try_new(Arc::new(schema), columns).unwrap()
} }
fn iter_by_greptimedb_values(schema: SchemaRef, record_batch: RecordBatch) {
let record_batch =
common_recordbatch::RecordBatch::try_from_df_record_batch(schema, record_batch).unwrap();
for row in record_batch.rows() {
black_box(row);
}
}
fn iter_by_loop_rows_and_columns(record_batch: RecordBatch) { fn iter_by_loop_rows_and_columns(record_batch: RecordBatch) {
for i in 0..record_batch.num_rows() { for i in 0..record_batch.num_rows() {
for column in record_batch.columns() { for column in record_batch.columns() {
@@ -116,6 +125,19 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("iter_record_batch"); let mut group = c.benchmark_group("iter_record_batch");
for rows in [1usize, 10, 100, 1_000, 10_000] { for rows in [1usize, 10, 100, 1_000, 10_000] {
group.bench_with_input(
BenchmarkId::new("by_greptimedb_values", rows),
&rows,
|b, rows| {
let record_batch = prepare_record_batch(*rows);
let schema =
Arc::new(datatypes::schema::Schema::try_from(record_batch.schema()).unwrap());
b.iter(|| {
iter_by_greptimedb_values(schema.clone(), record_batch.clone());
})
},
);
group.bench_with_input( group.bench_with_input(
BenchmarkId::new("by_loop_rows_and_columns", rows), BenchmarkId::new("by_loop_rows_and_columns", rows),
&rows, &rows,

View File

@@ -193,13 +193,6 @@ pub enum Error {
#[snafu(implicit)] #[snafu(implicit)]
location: Location, location: Location,
}, },
#[snafu(display("Exceeded memory limit: {}", msg))]
ExceedMemoryLimit {
msg: String,
#[snafu(implicit)]
location: Location,
},
} }
impl ErrorExt for Error { impl ErrorExt for Error {
@@ -236,8 +229,6 @@ impl ErrorExt for Error {
Error::StreamTimeout { .. } => StatusCode::Cancelled, Error::StreamTimeout { .. } => StatusCode::Cancelled,
Error::StreamCancelled { .. } => StatusCode::Cancelled, Error::StreamCancelled { .. } => StatusCode::Cancelled,
Error::ExceedMemoryLimit { .. } => StatusCode::RuntimeResourcesExhausted,
} }
} }

View File

@@ -21,14 +21,11 @@ pub mod filter;
mod recordbatch; mod recordbatch;
pub mod util; pub mod util;
use std::fmt;
use std::pin::Pin; use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use adapter::RecordBatchMetrics; use adapter::RecordBatchMetrics;
use arc_swap::ArcSwapOption; use arc_swap::ArcSwapOption;
use common_base::readable_size::ReadableSize;
pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::arrow::compute::SortOptions; use datatypes::arrow::compute::SortOptions;
pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch; pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
@@ -409,399 +406,6 @@ impl<S: Stream<Item = Result<RecordBatch>> + Unpin> Stream for RecordBatchStream
} }
} }
/// Memory permit for a stream, providing privileged access or rate limiting.
///
/// The permit tracks whether this stream has privileged Top-K status.
/// When dropped, it automatically releases any privileged slot it holds.
pub struct MemoryPermit {
tracker: QueryMemoryTracker,
is_privileged: AtomicBool,
}
impl MemoryPermit {
/// Check if this permit currently has privileged status.
pub fn is_privileged(&self) -> bool {
self.is_privileged.load(Ordering::Acquire)
}
/// Ensure this permit has privileged status by acquiring a slot if available.
/// Returns true if privileged (either already privileged or just acquired privilege).
fn ensure_privileged(&self) -> bool {
if self.is_privileged.load(Ordering::Acquire) {
return true;
}
// Try to claim a privileged slot
self.tracker
.privileged_count
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| {
if count < self.tracker.privileged_slots {
Some(count + 1)
} else {
None
}
})
.map(|_| {
self.is_privileged.store(true, Ordering::Release);
true
})
.unwrap_or(false)
}
/// Track additional memory usage with this permit.
    /// Returns an error if the limit is exceeded.
///
/// # Arguments
/// * `additional` - Additional memory size to track in bytes
/// * `stream_tracked` - Total memory already tracked by this stream
///
/// # Behavior
/// - Privileged streams: Can push global memory usage up to full limit
/// - Standard-tier streams: Can push global memory usage up to limit * standard_tier_memory_fraction (default: 0.7)
/// - Standard-tier streams automatically attempt to acquire privilege if slots become available
    /// - The configured limit is an absolute hard limit - no stream can exceed it
pub fn track(&self, additional: usize, stream_tracked: usize) -> Result<()> {
// Ensure privileged status if possible
let is_privileged = self.ensure_privileged();
self.tracker
.track_internal(additional, is_privileged, stream_tracked)
}
/// Release tracked memory.
///
/// # Arguments
/// * `amount` - Amount of memory to release in bytes
pub fn release(&self, amount: usize) {
self.tracker.release(amount);
}
}
impl Drop for MemoryPermit {
fn drop(&mut self) {
// Release privileged slot if we had one
if self.is_privileged.load(Ordering::Acquire) {
self.tracker
.privileged_count
.fetch_sub(1, Ordering::Release);
}
}
}
/// Memory tracker for RecordBatch streams. Clone to share the same limit across queries.
///
/// Implements a two-tier memory allocation strategy:
/// - **Privileged tier**: First N streams (default: 20) can use up to the full memory limit
/// - **Standard tier**: Remaining streams are restricted to a fraction of the limit (default: 70%)
/// - Privilege is granted on a first-come-first-served basis
/// - The configured limit is an absolute hard cap - no stream can exceed it
#[derive(Clone)]
pub struct QueryMemoryTracker {
current: Arc<AtomicUsize>,
limit: usize,
standard_tier_memory_fraction: f64,
privileged_count: Arc<AtomicUsize>,
privileged_slots: usize,
on_update: Option<Arc<dyn Fn(usize) + Send + Sync>>,
on_reject: Option<Arc<dyn Fn() + Send + Sync>>,
}
impl fmt::Debug for QueryMemoryTracker {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("QueryMemoryTracker")
.field("current", &self.current.load(Ordering::Acquire))
.field("limit", &self.limit)
.field(
"standard_tier_memory_fraction",
&self.standard_tier_memory_fraction,
)
.field(
"privileged_count",
&self.privileged_count.load(Ordering::Acquire),
)
.field("privileged_slots", &self.privileged_slots)
.field("on_update", &self.on_update.is_some())
.field("on_reject", &self.on_reject.is_some())
.finish()
}
}
impl QueryMemoryTracker {
// Default privileged slots when max_concurrent_queries is 0.
const DEFAULT_PRIVILEGED_SLOTS: usize = 20;
    // Ratio for the privileged tier: 70% of queries get privileged access, and the standard tier is capped at 70% of memory.
const DEFAULT_PRIVILEGED_TIER_RATIO: f64 = 0.7;
/// Create a new memory tracker with the given limit and max_concurrent_queries.
/// Calculates privileged slots as 70% of max_concurrent_queries (or 20 if max_concurrent_queries is 0).
///
/// # Arguments
/// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited.
/// * `max_concurrent_queries` - Maximum number of concurrent queries (0 = unlimited).
pub fn new(limit: usize, max_concurrent_queries: usize) -> Self {
let privileged_slots = Self::calculate_privileged_slots(max_concurrent_queries);
Self::with_privileged_slots(limit, privileged_slots)
}
/// Create a new memory tracker with custom privileged slots limit.
pub fn with_privileged_slots(limit: usize, privileged_slots: usize) -> Self {
Self::with_config(limit, privileged_slots, Self::DEFAULT_PRIVILEGED_TIER_RATIO)
}
/// Create a new memory tracker with full configuration.
///
/// # Arguments
/// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited.
/// * `privileged_slots` - Maximum number of streams that can get privileged status.
/// * `standard_tier_memory_fraction` - Memory fraction for standard-tier streams (range: [0.0, 1.0]).
///
/// # Panics
/// Panics if `standard_tier_memory_fraction` is not in the range [0.0, 1.0].
pub fn with_config(
limit: usize,
privileged_slots: usize,
standard_tier_memory_fraction: f64,
) -> Self {
assert!(
(0.0..=1.0).contains(&standard_tier_memory_fraction),
"standard_tier_memory_fraction must be in [0.0, 1.0], got {}",
standard_tier_memory_fraction
);
Self {
current: Arc::new(AtomicUsize::new(0)),
limit,
standard_tier_memory_fraction,
privileged_count: Arc::new(AtomicUsize::new(0)),
privileged_slots,
on_update: None,
on_reject: None,
}
}
/// Register a new permit for memory tracking.
/// The first `privileged_slots` permits get privileged status automatically.
/// The returned permit can be shared across multiple streams of the same query.
pub fn register_permit(&self) -> MemoryPermit {
// Try to claim a privileged slot
let is_privileged = self
.privileged_count
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| {
if count < self.privileged_slots {
Some(count + 1)
} else {
None
}
})
.is_ok();
MemoryPermit {
tracker: self.clone(),
is_privileged: AtomicBool::new(is_privileged),
}
}
/// Set a callback to be called whenever the usage changes successfully.
/// The callback receives the new total usage in bytes.
///
/// # Note
/// The callback is called after both successful `track()` and `release()` operations.
/// It is called even when `limit == 0` (unlimited mode) to track actual usage.
pub fn with_on_update<F>(mut self, on_update: F) -> Self
where
F: Fn(usize) + Send + Sync + 'static,
{
self.on_update = Some(Arc::new(on_update));
self
}
/// Set a callback to be called when memory allocation is rejected.
///
/// # Note
/// This is only called when `track()` fails due to exceeding the limit.
/// It is never called when `limit == 0` (unlimited mode).
pub fn with_on_reject<F>(mut self, on_reject: F) -> Self
where
F: Fn() + Send + Sync + 'static,
{
self.on_reject = Some(Arc::new(on_reject));
self
}
/// Get the current memory usage in bytes.
pub fn current(&self) -> usize {
self.current.load(Ordering::Acquire)
}
fn calculate_privileged_slots(max_concurrent_queries: usize) -> usize {
if max_concurrent_queries == 0 {
Self::DEFAULT_PRIVILEGED_SLOTS
} else {
((max_concurrent_queries as f64 * Self::DEFAULT_PRIVILEGED_TIER_RATIO) as usize).max(1)
}
}
/// Internal method to track additional memory usage.
///
/// Called by `MemoryPermit::track()`. Use `MemoryPermit::track()` instead of calling this directly.
fn track_internal(
&self,
additional: usize,
is_privileged: bool,
stream_tracked: usize,
) -> Result<()> {
// Calculate effective global limit based on stream privilege
// Privileged streams: can push global usage up to full limit
// Standard-tier streams: can only push global usage up to fraction of limit
let effective_limit = if is_privileged {
self.limit
} else {
(self.limit as f64 * self.standard_tier_memory_fraction) as usize
};
let mut new_total = 0;
let result = self
.current
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
new_total = current.saturating_add(additional);
if self.limit == 0 {
// Unlimited mode
return Some(new_total);
}
// Check if new global total exceeds effective limit
                // The configured limit is an absolute hard limit - no stream can exceed it
if new_total <= effective_limit {
Some(new_total)
} else {
None
}
});
match result {
Ok(_) => {
if let Some(callback) = &self.on_update {
callback(new_total);
}
Ok(())
}
Err(current) => {
if let Some(callback) = &self.on_reject {
callback();
}
let msg = format!(
"{} requested, {} used globally ({}%), {} used by this stream (privileged: {}), effective limit: {} ({}%), hard limit: {}",
ReadableSize(additional as u64),
ReadableSize(current as u64),
if self.limit > 0 {
current * 100 / self.limit
} else {
0
},
ReadableSize(stream_tracked as u64),
is_privileged,
ReadableSize(effective_limit as u64),
if self.limit > 0 {
effective_limit * 100 / self.limit
} else {
0
},
ReadableSize(self.limit as u64)
);
error::ExceedMemoryLimitSnafu { msg }.fail()
}
}
}
/// Release tracked memory.
///
/// # Arguments
/// * `amount` - Amount of memory to release in bytes
pub fn release(&self, amount: usize) {
if let Ok(old_value) =
self.current
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
Some(current.saturating_sub(amount))
})
&& let Some(callback) = &self.on_update
{
callback(old_value.saturating_sub(amount));
}
}
}
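As a quick numeric check of the two-tier cap computed in `track_internal` above, a standalone sketch (not the crate's API) of the effective-limit arithmetic with the default 0.7 standard-tier fraction:

// Standalone sketch mirroring the effective-limit arithmetic above.
fn effective_limit(limit: usize, is_privileged: bool, standard_tier_fraction: f64) -> usize {
    if is_privileged {
        limit
    } else {
        (limit as f64 * standard_tier_fraction) as usize
    }
}

fn main() {
    // Privileged stream: may push global usage up to the hard limit.
    assert_eq!(effective_limit(1000, true, 0.7), 1000);
    // Standard-tier stream: capped at limit * fraction.
    assert_eq!(effective_limit(1000, false, 0.7), 700);
    // limit == 0 is treated as "unlimited" by the tracker before this math applies.
    assert_eq!(effective_limit(0, false, 0.7), 0);
}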
/// A wrapper stream that tracks memory usage of RecordBatches.
pub struct MemoryTrackedStream {
inner: SendableRecordBatchStream,
permit: Arc<MemoryPermit>,
// Total tracked size, released when stream drops.
total_tracked: usize,
}
impl MemoryTrackedStream {
pub fn new(inner: SendableRecordBatchStream, permit: Arc<MemoryPermit>) -> Self {
Self {
inner,
permit,
total_tracked: 0,
}
}
}
impl Stream for MemoryTrackedStream {
type Item = Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
match Pin::new(&mut self.inner).poll_next(cx) {
Poll::Ready(Some(Ok(batch))) => {
let additional = batch
.columns()
.iter()
.map(|c| c.memory_size())
.sum::<usize>();
if let Err(e) = self.permit.track(additional, self.total_tracked) {
return Poll::Ready(Some(Err(e)));
}
self.total_tracked += additional;
Poll::Ready(Some(Ok(batch)))
}
Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))),
Poll::Ready(None) => Poll::Ready(None),
Poll::Pending => Poll::Pending,
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl Drop for MemoryTrackedStream {
fn drop(&mut self) {
if self.total_tracked > 0 {
self.permit.release(self.total_tracked);
}
}
}
impl RecordBatchStream for MemoryTrackedStream {
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
self.inner.output_ordering()
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
self.inner.metrics()
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::sync::Arc; use std::sync::Arc;
@@ -892,157 +496,4 @@ mod tests {
assert_eq!(collected[0], batch1); assert_eq!(collected[0], batch1);
assert_eq!(collected[1], batch2); assert_eq!(collected[1], batch2);
} }
#[test]
fn test_query_memory_tracker_basic() {
let tracker = Arc::new(QueryMemoryTracker::new(1000, 0));
// Register first stream - should get privileged status
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Privileged stream can use up to limit
assert!(permit1.track(500, 0).is_ok());
assert_eq!(tracker.current(), 500);
// Register second stream - also privileged
let permit2 = tracker.register_permit();
assert!(permit2.is_privileged());
// Can add more but cannot exceed hard limit (1000)
assert!(permit2.track(400, 0).is_ok());
assert_eq!(tracker.current(), 900);
permit1.release(500);
permit2.release(400);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_privileged_limit() {
// Privileged slots = 2 for easy testing
// Limit: 1000, standard-tier fraction: 0.7 (default)
// Privileged can push global to 1000, standard-tier can push global to 700
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 2));
// First 2 streams are privileged
let permit1 = tracker.register_permit();
let permit2 = tracker.register_permit();
assert!(permit1.is_privileged());
assert!(permit2.is_privileged());
// Third stream is standard-tier (not privileged)
let permit3 = tracker.register_permit();
assert!(!permit3.is_privileged());
// Privileged stream uses some memory
assert!(permit1.track(300, 0).is_ok());
assert_eq!(tracker.current(), 300);
// Standard-tier can add up to 400 (total becomes 700, its effective limit)
assert!(permit3.track(400, 0).is_ok());
assert_eq!(tracker.current(), 700);
// Standard-tier stream cannot push global beyond 700
let err = permit3.track(100, 400).unwrap_err();
let err_msg = err.to_string();
assert!(err_msg.contains("400B used by this stream"));
assert!(err_msg.contains("effective limit: 700B (70%)"));
assert!(err_msg.contains("700B used globally (70%)"));
assert_eq!(tracker.current(), 700);
permit1.release(300);
permit3.release(400);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_promotion() {
// Privileged slots = 1 for easy testing
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1));
// First stream is privileged
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Second stream is standard-tier (can only use 500)
let permit2 = tracker.register_permit();
assert!(!permit2.is_privileged());
// Standard-tier can only track 500
assert!(permit2.track(400, 0).is_ok());
assert_eq!(tracker.current(), 400);
// Drop first permit to release privileged slot
drop(permit1);
// Second stream can now be promoted and use more memory
assert!(permit2.track(500, 400).is_ok());
assert!(permit2.is_privileged());
assert_eq!(tracker.current(), 900);
permit2.release(900);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_privileged_hard_limit() {
        // Test that the configured limit is the absolute hard limit for all streams
// Privileged: can use full limit (1000)
// Standard-tier: can use 0.7x limit (700 with defaults)
let tracker = Arc::new(QueryMemoryTracker::new(1000, 0));
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Privileged can use up to full limit (1000)
assert!(permit1.track(900, 0).is_ok());
assert_eq!(tracker.current(), 900);
// Privileged cannot exceed hard limit (1000)
assert!(permit1.track(200, 900).is_err());
assert_eq!(tracker.current(), 900);
// Can add within hard limit
assert!(permit1.track(100, 900).is_ok());
assert_eq!(tracker.current(), 1000);
// Cannot exceed even by 1 byte
assert!(permit1.track(1, 1000).is_err());
assert_eq!(tracker.current(), 1000);
permit1.release(1000);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_standard_tier_fraction() {
// Test standard-tier streams use fraction of limit
// Limit: 1000, default fraction: 0.7, so standard-tier can use 700
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1));
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
let permit2 = tracker.register_permit();
assert!(!permit2.is_privileged());
// Standard-tier can use up to 700 (1000 * 0.7 default)
assert!(permit2.track(600, 0).is_ok());
assert_eq!(tracker.current(), 600);
// Cannot exceed standard-tier limit (700)
assert!(permit2.track(200, 600).is_err());
assert_eq!(tracker.current(), 600);
// Can add within standard-tier limit
assert!(permit2.track(100, 600).is_ok());
assert_eq!(tracker.current(), 700);
// Cannot exceed standard-tier limit
assert!(permit2.track(1, 700).is_err());
assert_eq!(tracker.current(), 700);
permit2.release(700);
assert_eq!(tracker.current(), 0);
}
} }

View File

@@ -23,6 +23,7 @@ use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef a
use datatypes::arrow::array::RecordBatchOptions; use datatypes::arrow::array::RecordBatchOptions;
use datatypes::prelude::DataType; use datatypes::prelude::DataType;
use datatypes::schema::SchemaRef; use datatypes::schema::SchemaRef;
use datatypes::value::Value;
use datatypes::vectors::{Helper, VectorRef}; use datatypes::vectors::{Helper, VectorRef};
use serde::ser::{Error, SerializeStruct}; use serde::ser::{Error, SerializeStruct};
use serde::{Serialize, Serializer}; use serde::{Serialize, Serializer};
@@ -193,6 +194,11 @@ impl RecordBatch {
self.df_record_batch.num_rows() self.df_record_batch.num_rows()
} }
    /// Creates an iterator that traverses the data row by row.
pub fn rows(&self) -> RecordBatchRowIterator<'_> {
RecordBatchRowIterator::new(self)
}
pub fn column_vectors( pub fn column_vectors(
&self, &self,
table_name: &str, table_name: &str,
@@ -271,6 +277,44 @@ impl Serialize for RecordBatch {
} }
} }
pub struct RecordBatchRowIterator<'a> {
record_batch: &'a RecordBatch,
rows: usize,
columns: usize,
row_cursor: usize,
}
impl<'a> RecordBatchRowIterator<'a> {
fn new(record_batch: &'a RecordBatch) -> RecordBatchRowIterator<'a> {
RecordBatchRowIterator {
record_batch,
rows: record_batch.df_record_batch.num_rows(),
columns: record_batch.df_record_batch.num_columns(),
row_cursor: 0,
}
}
}
impl Iterator for RecordBatchRowIterator<'_> {
type Item = Vec<Value>;
fn next(&mut self) -> Option<Self::Item> {
if self.row_cursor == self.rows {
None
} else {
let mut row = Vec::with_capacity(self.columns);
for col in 0..self.columns {
let column = self.record_batch.column(col);
row.push(column.get(self.row_cursor));
}
self.row_cursor += 1;
Some(row)
}
}
}
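A short usage sketch of the `rows()` iterator added above, mirroring the test further down; it assumes the `common_recordbatch::RecordBatch` re-export and the usual `datatypes` schema and vector types:

use std::sync::Arc;

use common_recordbatch::RecordBatch;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::value::Value;
use datatypes::vectors::{StringVector, UInt32Vector, VectorRef};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false),
        ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true),
    ]));
    let columns: Vec<VectorRef> = vec![
        Arc::new(UInt32Vector::from_slice(vec![1, 2])),
        Arc::new(StringVector::from(vec![None, Some("hello")])),
    ];
    let batch = RecordBatch::new(schema, columns).unwrap();

    // Each item is one row materialized as a Vec<Value>.
    for row in batch.rows() {
        println!("{row:?}");
    }
    assert_eq!(batch.rows().next().unwrap(), vec![Value::UInt32(1), Value::Null]);
}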
/// Merges multiple record batches into a single one. /// Merges multiple record batches into a single one.
pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Result<RecordBatch> { pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Result<RecordBatch> {
let batches_len = batches.len(); let batches_len = batches.len();
@@ -305,9 +349,7 @@ pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Resul
mod tests { mod tests {
use std::sync::Arc; use std::sync::Arc;
use datatypes::arrow::array::{AsArray, UInt32Array}; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, UInt32Type};
use datatypes::arrow_array::StringArray;
use datatypes::data_type::ConcreteDataType; use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema}; use datatypes::schema::{ColumnSchema, Schema};
use datatypes::vectors::{StringVector, UInt32Vector}; use datatypes::vectors::{StringVector, UInt32Vector};
@@ -365,6 +407,64 @@ mod tests {
); );
} }
#[test]
fn test_record_batch_visitor() {
let column_schemas = vec![
ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false),
ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true),
];
let schema = Arc::new(Schema::new(column_schemas));
let columns: Vec<VectorRef> = vec![
Arc::new(UInt32Vector::from_slice(vec![1, 2, 3, 4])),
Arc::new(StringVector::from(vec![
None,
Some("hello"),
Some("greptime"),
None,
])),
];
let recordbatch = RecordBatch::new(schema, columns).unwrap();
let mut record_batch_iter = recordbatch.rows();
assert_eq!(
vec![Value::UInt32(1), Value::Null],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(2), Value::String("hello".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(3), Value::String("greptime".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(4), Value::Null],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert!(record_batch_iter.next().is_none());
}
#[test] #[test]
fn test_record_batch_slice() { fn test_record_batch_slice() {
let column_schemas = vec![ let column_schemas = vec![
@@ -383,16 +483,26 @@ mod tests {
]; ];
let recordbatch = RecordBatch::new(schema, columns).unwrap(); let recordbatch = RecordBatch::new(schema, columns).unwrap();
let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice"); let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice");
let mut record_batch_iter = recordbatch.rows();
assert_eq!(
vec![Value::UInt32(2), Value::String("hello".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
let expected = &UInt32Array::from_iter_values([2u32, 3]); assert_eq!(
let array = recordbatch.column(0).to_arrow_array(); vec![Value::UInt32(3), Value::String("greptime".into())],
let actual = array.as_primitive::<UInt32Type>(); record_batch_iter
assert_eq!(expected, actual); .next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
let expected = &StringArray::from(vec!["hello", "greptime"]); assert!(record_batch_iter.next().is_none());
let array = recordbatch.column(1).to_arrow_array();
let actual = array.as_string::<i32>();
assert_eq!(expected, actual);
assert!(recordbatch.slice(1, 5).is_err()); assert!(recordbatch.slice(1, 5).is_err());
} }

View File

@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
use std::fmt::Display; use std::fmt::Display;
use std::str::FromStr;
use chrono::{FixedOffset, TimeZone}; use chrono::{FixedOffset, TimeZone};
use chrono_tz::{OffsetComponents, Tz}; use chrono_tz::{OffsetComponents, Tz};
@@ -101,7 +102,7 @@ impl Timezone {
.parse::<u32>() .parse::<u32>()
.context(ParseOffsetStrSnafu { raw: tz_string })?; .context(ParseOffsetStrSnafu { raw: tz_string })?;
Self::hours_mins_opt(hrs, mins) Self::hours_mins_opt(hrs, mins)
} else if let Ok(tz) = Tz::from_str_insensitive(tz_string) { } else if let Ok(tz) = Tz::from_str(tz_string) {
Ok(Self::Named(tz)) Ok(Self::Named(tz))
} else { } else {
ParseTimezoneNameSnafu { raw: tz_string }.fail() ParseTimezoneNameSnafu { raw: tz_string }.fail()
@@ -202,10 +203,6 @@ mod tests {
Timezone::Named(Tz::Asia__Shanghai), Timezone::Named(Tz::Asia__Shanghai),
Timezone::from_tz_string("Asia/Shanghai").unwrap() Timezone::from_tz_string("Asia/Shanghai").unwrap()
); );
assert_eq!(
Timezone::Named(Tz::Asia__Shanghai),
Timezone::from_tz_string("Asia/ShangHai").unwrap()
);
assert_eq!( assert_eq!(
Timezone::Named(Tz::UTC), Timezone::Named(Tz::UTC),
Timezone::from_tz_string("UTC").unwrap() Timezone::from_tz_string("UTC").unwrap()

View File

@@ -522,7 +522,6 @@ impl DatanodeBuilder {
file_ref_manager, file_ref_manager,
partition_expr_fetcher.clone(), partition_expr_fetcher.clone(),
plugins, plugins,
opts.max_concurrent_queries,
); );
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]
@@ -565,7 +564,6 @@ impl DatanodeBuilder {
file_ref_manager, file_ref_manager,
partition_expr_fetcher, partition_expr_fetcher,
plugins, plugins,
opts.max_concurrent_queries,
); );
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]
@@ -587,7 +585,6 @@ impl DatanodeBuilder {
file_ref_manager, file_ref_manager,
partition_expr_fetcher.clone(), partition_expr_fetcher.clone(),
plugins, plugins,
opts.max_concurrent_queries,
); );
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]

View File

@@ -44,6 +44,7 @@ use crate::region_server::RegionServer;
#[derive(Clone)] #[derive(Clone)]
pub struct RegionHeartbeatResponseHandler { pub struct RegionHeartbeatResponseHandler {
region_server: RegionServer, region_server: RegionServer,
catchup_tasks: TaskTracker<()>,
downgrade_tasks: TaskTracker<()>, downgrade_tasks: TaskTracker<()>,
flush_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>,
open_region_parallelism: usize, open_region_parallelism: usize,
@@ -63,6 +64,7 @@ pub trait InstructionHandler: Send + Sync {
#[derive(Clone)] #[derive(Clone)]
pub struct HandlerContext { pub struct HandlerContext {
region_server: RegionServer, region_server: RegionServer,
catchup_tasks: TaskTracker<()>,
downgrade_tasks: TaskTracker<()>, downgrade_tasks: TaskTracker<()>,
flush_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>,
gc_tasks: TaskTracker<GcReport>, gc_tasks: TaskTracker<GcReport>,
@@ -73,6 +75,7 @@ impl HandlerContext {
pub fn new_for_test(region_server: RegionServer) -> Self { pub fn new_for_test(region_server: RegionServer) -> Self {
Self { Self {
region_server, region_server,
catchup_tasks: TaskTracker::new(),
downgrade_tasks: TaskTracker::new(), downgrade_tasks: TaskTracker::new(),
flush_tasks: TaskTracker::new(), flush_tasks: TaskTracker::new(),
gc_tasks: TaskTracker::new(), gc_tasks: TaskTracker::new(),
@@ -85,6 +88,7 @@ impl RegionHeartbeatResponseHandler {
pub fn new(region_server: RegionServer) -> Self { pub fn new(region_server: RegionServer) -> Self {
Self { Self {
region_server, region_server,
catchup_tasks: TaskTracker::new(),
downgrade_tasks: TaskTracker::new(), downgrade_tasks: TaskTracker::new(),
flush_tasks: TaskTracker::new(), flush_tasks: TaskTracker::new(),
// Default to half of the number of CPUs. // Default to half of the number of CPUs.
@@ -110,12 +114,7 @@ impl RegionHeartbeatResponseHandler {
)), )),
Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())), Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())),
Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())), Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())),
Instruction::UpgradeRegions(_) => Ok(Box::new( Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler.into())),
UpgradeRegionsHandler {
upgrade_region_parallelism: self.open_region_parallelism,
}
.into(),
)),
Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())), Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())),
Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())), Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())),
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(), Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
@@ -195,7 +194,7 @@ dispatch_instr!(
OpenRegions => OpenRegions, OpenRegions => OpenRegions,
FlushRegions => FlushRegions, FlushRegions => FlushRegions,
DowngradeRegions => DowngradeRegions, DowngradeRegions => DowngradeRegions,
UpgradeRegions => UpgradeRegions, UpgradeRegion => UpgradeRegions,
GetFileRefs => GetFileRefs, GetFileRefs => GetFileRefs,
GcRegions => GcRegions, GcRegions => GcRegions,
); );
@@ -217,6 +216,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
let mailbox = ctx.mailbox.clone(); let mailbox = ctx.mailbox.clone();
let region_server = self.region_server.clone(); let region_server = self.region_server.clone();
let catchup_tasks = self.catchup_tasks.clone();
let downgrade_tasks = self.downgrade_tasks.clone(); let downgrade_tasks = self.downgrade_tasks.clone();
let flush_tasks = self.flush_tasks.clone(); let flush_tasks = self.flush_tasks.clone();
let gc_tasks = self.gc_tasks.clone(); let gc_tasks = self.gc_tasks.clone();
@@ -226,6 +226,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
.handle( .handle(
&HandlerContext { &HandlerContext {
region_server, region_server,
catchup_tasks,
downgrade_tasks, downgrade_tasks,
flush_tasks, flush_tasks,
gc_tasks, gc_tasks,
@@ -333,10 +334,10 @@ mod tests {
); );
// Upgrade region // Upgrade region
let instruction = Instruction::UpgradeRegions(vec![UpgradeRegion { let instruction = Instruction::UpgradeRegion(UpgradeRegion {
region_id, region_id,
..Default::default() ..Default::default()
}]); });
assert!( assert!(
heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction))) heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction)))
); );

View File

@@ -12,209 +12,125 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use common_error::ext::{BoxedError, ErrorExt}; use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply};
use common_error::status_code::StatusCode; use common_telemetry::{info, warn};
use common_meta::instruction::{ use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint};
InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply,
};
use common_telemetry::{debug, info, warn};
use store_api::region_request::{RegionCatchupRequest, ReplayCheckpoint};
use store_api::storage::RegionId;
use crate::error::Result;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
use crate::heartbeat::task_tracker::WaitResult;
#[derive(Debug, Clone, Copy, Default)] #[derive(Debug, Clone, Copy, Default)]
pub struct UpgradeRegionsHandler { pub struct UpgradeRegionsHandler;
pub upgrade_region_parallelism: usize,
}
#[cfg(test)]
impl UpgradeRegionsHandler {
fn new_test() -> UpgradeRegionsHandler {
UpgradeRegionsHandler {
upgrade_region_parallelism: 8,
}
}
}
impl UpgradeRegionsHandler {
fn convert_responses_to_replies(
responses: Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>>,
catchup_regions: &[RegionId],
) -> Vec<UpgradeRegionReply> {
match responses {
Ok(responses) => responses
.into_iter()
.map(|(region_id, result)| match result {
Ok(()) => UpgradeRegionReply {
region_id,
ready: true,
exists: true,
error: None,
},
Err(err) => {
if err.status_code() == StatusCode::RegionNotFound {
UpgradeRegionReply {
region_id,
ready: false,
exists: false,
error: Some(format!("{err:?}")),
}
} else {
UpgradeRegionReply {
region_id,
ready: false,
exists: true,
error: Some(format!("{err:?}")),
}
}
}
})
.collect::<Vec<_>>(),
Err(err) => catchup_regions
.iter()
.map(|region_id| UpgradeRegionReply {
region_id: *region_id,
ready: false,
exists: true,
error: Some(format!("{err:?}")),
})
.collect::<Vec<_>>(),
}
}
}
impl UpgradeRegionsHandler {
    // Handles the upgrade regions instruction.
    //
    // Returns a batch of upgrade region replies; the order of the replies is not guaranteed.
async fn handle_upgrade_regions(
&self,
ctx: &HandlerContext,
upgrade_regions: Vec<UpgradeRegion>,
) -> Vec<UpgradeRegionReply> {
let num_upgrade_regions = upgrade_regions.len();
let mut replies = Vec::with_capacity(num_upgrade_regions);
let mut catchup_requests = Vec::with_capacity(num_upgrade_regions);
let mut catchup_regions = Vec::with_capacity(num_upgrade_regions);
let mut timeout = None;
for upgrade_region in upgrade_regions {
let Some(writable) = ctx.region_server.is_region_leader(upgrade_region.region_id)
else {
// Region is not found.
debug!("Region {} is not found", upgrade_region.region_id);
replies.push(UpgradeRegionReply {
region_id: upgrade_region.region_id,
ready: false,
exists: false,
error: None,
});
continue;
};
// Ignores the catchup requests for writable regions.
if writable {
warn!(
"Region {} is writable, ignores the catchup request",
upgrade_region.region_id
);
replies.push(UpgradeRegionReply {
region_id: upgrade_region.region_id,
ready: true,
exists: true,
error: None,
});
} else {
let UpgradeRegion {
last_entry_id,
metadata_last_entry_id,
location_id,
replay_entry_id,
metadata_replay_entry_id,
replay_timeout,
..
} = upgrade_region;
match timeout {
Some(timeout) => {
debug_assert_eq!(timeout, replay_timeout);
}
None => {
                        // TODO(weny): make the replay_timeout required.
timeout = Some(replay_timeout);
}
}
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
catchup_regions.push(upgrade_region.region_id);
catchup_requests.push((
upgrade_region.region_id,
RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
},
));
}
}
let Some(timeout) = timeout else {
            // No replay timeout was recorded, meaning every region was writable and there is nothing to catch up.
info!("All regions are writable, no need to catchup");
debug_assert_eq!(replies.len(), num_upgrade_regions);
return replies;
};
match tokio::time::timeout(
timeout,
ctx.region_server
.handle_batch_catchup_requests(self.upgrade_region_parallelism, catchup_requests),
)
.await
{
Ok(responses) => {
replies.extend(
Self::convert_responses_to_replies(responses, &catchup_regions).into_iter(),
);
}
Err(_) => {
replies.extend(catchup_regions.iter().map(|region_id| UpgradeRegionReply {
region_id: *region_id,
ready: false,
exists: true,
error: None,
}));
}
}
replies
}
}
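The `Err(_)` arm in the match above relies on `tokio::time::timeout` returning the outer `Err(Elapsed)` only when the deadline passes, in which case the pending catchup futures are dropped and the regions are reported as not ready yet. A minimal standalone illustration (assumes tokio with the rt, macros, and time features):

use std::time::Duration;

#[tokio::main]
async fn main() {
    // Completes before the deadline: the outer Result is Ok with the future's output.
    let in_time = tokio::time::timeout(Duration::from_millis(50), async { 42 }).await;
    assert_eq!(in_time.unwrap(), 42);

    // Misses the deadline: the outer Result is Err(Elapsed) and the inner future is dropped.
    let too_slow = tokio::time::timeout(
        Duration::from_millis(10),
        tokio::time::sleep(Duration::from_millis(100)),
    )
    .await;
    assert!(too_slow.is_err());
}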
#[async_trait::async_trait] #[async_trait::async_trait]
impl InstructionHandler for UpgradeRegionsHandler { impl InstructionHandler for UpgradeRegionsHandler {
type Instruction = Vec<UpgradeRegion>; type Instruction = UpgradeRegion;
async fn handle( async fn handle(
&self, &self,
ctx: &HandlerContext, ctx: &HandlerContext,
upgrade_regions: Self::Instruction, UpgradeRegion {
region_id,
last_entry_id,
metadata_last_entry_id,
replay_timeout,
location_id,
replay_entry_id,
metadata_replay_entry_id,
}: UpgradeRegion,
) -> Option<InstructionReply> { ) -> Option<InstructionReply> {
let replies = self.handle_upgrade_regions(ctx, upgrade_regions).await; let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: false,
error: None,
}));
};
Some(InstructionReply::UpgradeRegions(UpgradeRegionsReply::new( if writable {
replies, return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
))) ready: true,
exists: true,
error: None,
}));
}
let region_server_moved = ctx.region_server.clone();
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
        // The catchup task is almost zero cost if the underlying region is already writable.
// Therefore, it always registers a new catchup task.
let register_result = ctx
.catchup_tasks
.try_register(
region_id,
Box::pin(async move {
info!(
"Executing region: {region_id} catchup to: last entry id {last_entry_id:?}"
);
region_server_moved
.handle_request(
region_id,
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
}),
)
.await?;
Ok(())
}),
)
.await;
if register_result.is_busy() {
warn!("Another catchup task is running for the region: {region_id}");
}
        // Returns immediately when no replay timeout is given.
let Some(replay_timeout) = replay_timeout else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
}));
};
// We don't care that it returns a newly registered or running task.
let mut watcher = register_result.into_watcher();
let result = ctx.catchup_tasks.wait(&mut watcher, replay_timeout).await;
match result {
WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
})),
WaitResult::Finish(Ok(_)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: true,
exists: true,
error: None,
}))
}
WaitResult::Finish(Err(err)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: Some(format!("{err:?}")),
}))
}
}
} }
} }
@@ -226,6 +142,7 @@ mod tests {
use mito2::engine::MITO_ENGINE_NAME; use mito2::engine::MITO_ENGINE_NAME;
use store_api::region_engine::RegionRole; use store_api::region_engine::RegionRole;
use store_api::storage::RegionId; use store_api::storage::RegionId;
use tokio::time::Instant;
use crate::error; use crate::error;
use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler; use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler;
@@ -241,30 +158,21 @@ mod tests {
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
let region_id2 = RegionId::new(1024, 2); let waits = vec![None, Some(Duration::from_millis(100u64))];
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test() for replay_timeout in waits {
.handle( let reply = UpgradeRegionsHandler
&handler_context, .handle(
vec![ &handler_context,
UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout, replay_timeout,
..Default::default() ..Default::default()
}, },
UpgradeRegion { )
region_id: region_id2, .await;
replay_timeout,
..Default::default()
},
],
)
.await;
let replies = &reply.unwrap().expect_upgrade_regions_reply(); let reply = reply.unwrap().expect_upgrade_region_reply();
assert_eq!(replies[0].region_id, region_id);
assert_eq!(replies[1].region_id, region_id2);
for reply in replies {
assert!(!reply.exists); assert!(!reply.exists);
assert!(reply.error.is_none()); assert!(reply.error.is_none());
} }
@@ -274,7 +182,6 @@ mod tests {
async fn test_region_writable() { async fn test_region_writable() {
let mock_region_server = mock_region_server(); let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
let region_id2 = RegionId::new(1024, 2);
let (mock_engine, _) = let (mock_engine, _) =
MockRegionEngine::with_custom_apply_fn(MITO_ENGINE_NAME, |region_engine| { MockRegionEngine::with_custom_apply_fn(MITO_ENGINE_NAME, |region_engine| {
@@ -284,32 +191,25 @@ mod tests {
unreachable!(); unreachable!();
})); }));
}); });
mock_region_server.register_test_region(region_id, mock_engine.clone()); mock_region_server.register_test_region(region_id, mock_engine);
mock_region_server.register_test_region(region_id2, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test() let waits = vec![None, Some(Duration::from_millis(100u64))];
.handle(
&handler_context, for replay_timeout in waits {
vec![ let reply = UpgradeRegionsHandler
.handle(
&handler_context,
UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout, replay_timeout,
..Default::default() ..Default::default()
}, },
UpgradeRegion { )
region_id: region_id2, .await;
replay_timeout,
..Default::default()
},
],
)
.await;
let replies = &reply.unwrap().expect_upgrade_regions_reply(); let reply = reply.unwrap().expect_upgrade_region_reply();
assert_eq!(replies[0].region_id, region_id);
assert_eq!(replies[1].region_id, region_id2);
for reply in replies {
assert!(reply.ready); assert!(reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none()); assert!(reply.error.is_none());
@@ -332,27 +232,30 @@ mod tests {
mock_region_server.register_test_region(region_id, mock_engine); mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
vec![UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
}],
)
.await;
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let waits = vec![None, Some(Duration::from_millis(100u64))];
assert!(!reply.ready);
assert!(reply.exists); for replay_timeout in waits {
assert!(reply.error.is_none(), "error: {:?}", reply.error); let reply = UpgradeRegionsHandler
.handle(
&handler_context,
UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
},
)
.await;
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
} }
#[tokio::test] #[tokio::test]
async fn test_region_not_ready_with_retry() { async fn test_region_not_ready_with_retry() {
common_telemetry::init_default_ut_logging();
let mock_region_server = mock_region_server(); let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
@@ -361,48 +264,58 @@ mod tests {
// Region is not ready. // Region is not ready.
region_engine.mock_role = Some(Some(RegionRole::Follower)); region_engine.mock_role = Some(Some(RegionRole::Follower));
region_engine.handle_request_mock_fn = Some(Box::new(|_, _| Ok(0))); region_engine.handle_request_mock_fn = Some(Box::new(|_, _| Ok(0)));
                // Note: don't change this delay; the timing assertion below depends on it.
region_engine.handle_request_delay = Some(Duration::from_millis(300)); region_engine.handle_request_delay = Some(Duration::from_millis(300));
}); });
mock_region_server.register_test_region(region_id, mock_engine); mock_region_server.register_test_region(region_id, mock_engine);
let waits = vec![Duration::from_millis(100u64), Duration::from_millis(100u64)];
let waits = vec![
Some(Duration::from_millis(100u64)),
Some(Duration::from_millis(100u64)),
];
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
for replay_timeout in waits { for replay_timeout in waits {
let reply = UpgradeRegionsHandler::new_test() let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout, replay_timeout,
..Default::default() ..Default::default()
}], },
) )
.await; .await;
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready); assert!(!reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none(), "error: {:?}", reply.error); assert!(reply.error.is_none());
} }
let reply = UpgradeRegionsHandler::new_test() let timer = Instant::now();
let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout: Duration::from_millis(500), replay_timeout: Some(Duration::from_millis(500)),
..Default::default() ..Default::default()
}], },
) )
.await; .await;
        let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; // Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(reply.ready); assert!(reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none(), "error: {:?}", reply.error); assert!(reply.error.is_none());
} }
#[tokio::test] #[tokio::test]
async fn test_region_error() { async fn test_region_error() {
common_telemetry::init_default_ut_logging();
let mock_region_server = mock_region_server(); let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
@@ -422,37 +335,38 @@ mod tests {
mock_region_server.register_test_region(region_id, mock_engine); mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let reply = UpgradeRegionsHandler::new_test()
let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
..Default::default() ..Default::default()
}], },
) )
.await; .await;
        // It didn't wait for the handle to return, so it had no idea about the error. // It didn't wait for the handle to return, so it had no idea about the error.
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready); assert!(!reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none()); assert!(reply.error.is_none());
let reply = UpgradeRegionsHandler::new_test() let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout: Duration::from_millis(200), replay_timeout: Some(Duration::from_millis(200)),
..Default::default() ..Default::default()
}], },
) )
.await; .await;
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready); assert!(!reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_some()); assert!(reply.error.is_some());
assert!(reply.error.as_ref().unwrap().contains("mock_error")); assert!(reply.error.unwrap().contains("mock_error"));
} }
} }

View File

@@ -75,20 +75,4 @@ lazy_static! {
&[RESULT_TYPE] &[RESULT_TYPE]
) )
.unwrap(); .unwrap();
/// Total count of failed region server requests.
pub static ref REGION_SERVER_REQUEST_FAILURE_COUNT: IntCounterVec = register_int_counter_vec!(
"greptime_datanode_region_request_fail_count",
"failed region server requests count",
&[REGION_REQUEST_TYPE]
)
.unwrap();
/// Total count of failed insert requests to region server.
pub static ref REGION_SERVER_INSERT_FAIL_COUNT: IntCounterVec = register_int_counter_vec!(
"greptime_datanode_region_failed_insert_count",
"failed region server insert requests count",
&[REGION_REQUEST_TYPE]
)
.unwrap();
} }

View File

@@ -66,8 +66,7 @@ use store_api::region_engine::{
SettableRegionRoleState, SettableRegionRoleState,
}; };
use store_api::region_request::{ use store_api::region_request::{
AffectedRows, BatchRegionDdlRequest, RegionCatchupRequest, RegionCloseRequest, AffectedRows, BatchRegionDdlRequest, RegionCloseRequest, RegionOpenRequest, RegionRequest,
RegionOpenRequest, RegionRequest,
}; };
use store_api::storage::RegionId; use store_api::storage::RegionId;
use tokio::sync::{Semaphore, SemaphorePermit}; use tokio::sync::{Semaphore, SemaphorePermit};
@@ -192,17 +191,6 @@ impl RegionServer {
.await .await
} }
#[tracing::instrument(skip_all)]
pub async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
self.inner
.handle_batch_catchup_requests(parallelism, requests)
.await
}
#[tracing::instrument(skip_all, fields(request_type = request.request_type()))] #[tracing::instrument(skip_all, fields(request_type = request.request_type()))]
pub async fn handle_request( pub async fn handle_request(
&self, &self,
@@ -411,14 +399,6 @@ impl RegionServer {
#[cfg(test)] #[cfg(test)]
/// Registers a region for test purpose. /// Registers a region for test purpose.
pub(crate) fn register_test_region(&self, region_id: RegionId, engine: RegionEngineRef) { pub(crate) fn register_test_region(&self, region_id: RegionId, engine: RegionEngineRef) {
{
let mut engines = self.inner.engines.write().unwrap();
if !engines.contains_key(engine.name()) {
debug!("Registering test engine: {}", engine.name());
engines.insert(engine.name().to_string(), engine.clone());
}
}
self.inner self.inner
.region_map .region_map
.insert(region_id, RegionEngineWithStatus::Ready(engine)); .insert(region_id, RegionEngineWithStatus::Ready(engine));
@@ -600,8 +580,6 @@ impl RegionServer {
#[async_trait] #[async_trait]
impl RegionServerHandler for RegionServer { impl RegionServerHandler for RegionServer {
async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponseV1> { async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponseV1> {
let failed_requests_cnt = crate::metrics::REGION_SERVER_REQUEST_FAILURE_COUNT
.with_label_values(&[request.as_ref()]);
let response = match &request { let response = match &request {
region_request::Body::Creates(_) region_request::Body::Creates(_)
| region_request::Body::Drops(_) | region_request::Body::Drops(_)
@@ -619,9 +597,6 @@ impl RegionServerHandler for RegionServer {
_ => self.handle_requests_in_serial(request).await, _ => self.handle_requests_in_serial(request).await,
} }
.map_err(BoxedError::new) .map_err(BoxedError::new)
.inspect_err(|_| {
failed_requests_cnt.inc();
})
.context(ExecuteGrpcRequestSnafu)?; .context(ExecuteGrpcRequestSnafu)?;
Ok(RegionResponseV1 { Ok(RegionResponseV1 {
@@ -997,116 +972,6 @@ impl RegionServerInner {
.collect::<Vec<_>>()) .collect::<Vec<_>>())
} }
pub async fn handle_batch_catchup_requests_inner(
&self,
engine: RegionEngineRef,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
for (region_id, _) in &requests {
self.set_region_status_not_ready(*region_id, &engine, &RegionChange::Catchup);
}
let region_ids = requests
.iter()
.map(|(region_id, _)| *region_id)
.collect::<Vec<_>>();
let mut responses = Vec::with_capacity(requests.len());
match engine
.handle_batch_catchup_requests(parallelism, requests)
.await
{
Ok(results) => {
for (region_id, result) in results {
match result {
Ok(_) => {
if let Err(e) = self
.set_region_status_ready(
region_id,
engine.clone(),
RegionChange::Catchup,
)
.await
{
error!(e; "Failed to set region to ready: {}", region_id);
responses.push((region_id, Err(BoxedError::new(e))));
} else {
responses.push((region_id, Ok(())));
}
}
Err(e) => {
self.unset_region_status(region_id, &engine, RegionChange::Catchup);
error!(e; "Failed to catchup region: {}", region_id);
responses.push((region_id, Err(e)));
}
}
}
}
Err(e) => {
for region_id in region_ids {
self.unset_region_status(region_id, &engine, RegionChange::Catchup);
}
error!(e; "Failed to catchup batch regions");
return error::UnexpectedSnafu {
violated: format!("Failed to catchup batch regions: {:?}", e),
}
.fail();
}
}
Ok(responses)
}
pub async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
let mut engine_grouped_requests: HashMap<String, Vec<_>> = HashMap::new();
let mut responses = Vec::with_capacity(requests.len());
for (region_id, request) in requests {
if let Ok(engine) = self.get_engine(region_id, &RegionChange::Catchup) {
match engine {
CurrentEngine::Engine(engine) => {
engine_grouped_requests
.entry(engine.name().to_string())
.or_default()
.push((region_id, request));
}
CurrentEngine::EarlyReturn(_) => {
return error::UnexpectedSnafu {
violated: format!("Unexpected engine type for region {}", region_id),
}
.fail();
}
}
} else {
responses.push((
region_id,
Err(BoxedError::new(
error::RegionNotFoundSnafu { region_id }.build(),
)),
));
}
}
for (engine, requests) in engine_grouped_requests {
let engine = self
.engines
.read()
.unwrap()
.get(&engine)
.with_context(|| RegionEngineNotFoundSnafu { name: &engine })?
.clone();
responses.extend(
self.handle_batch_catchup_requests_inner(engine, parallelism, requests)
.await?,
);
}
Ok(responses)
}
// Handle requests in batch. // Handle requests in batch.
// //
// limitation: all create requests must be in the same engine. // limitation: all create requests must be in the same engine.
@@ -1235,11 +1100,6 @@ impl RegionServerInner {
}) })
} }
Err(err) => { Err(err) => {
if matches!(region_change, RegionChange::Ingest) {
crate::metrics::REGION_SERVER_INSERT_FAIL_COUNT
.with_label_values(&[request_type])
.inc();
}
// Removes the region status if the operation fails. // Removes the region status if the operation fails.
self.unset_region_status(region_id, &engine, region_change); self.unset_region_status(region_id, &engine, region_change);
Err(err) Err(err)

View File

@@ -277,10 +277,6 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Null(NullType)) matches!(self, ConcreteDataType::Null(NullType))
} }
pub(crate) fn is_struct(&self) -> bool {
matches!(self, ConcreteDataType::Struct(_))
}
/// Try to cast the type as a [`ListType`]. /// Try to cast the type as a [`ListType`].
pub fn as_list(&self) -> Option<&ListType> { pub fn as_list(&self) -> Option<&ListType> {
match self { match self {

View File

@@ -266,14 +266,6 @@ pub enum Error {
#[snafu(implicit)] #[snafu(implicit)]
location: Location, location: Location,
}, },
#[snafu(display("Failed to parse or serialize arrow metadata"))]
ArrowMetadata {
#[snafu(source)]
error: arrow::error::ArrowError,
#[snafu(implicit)]
location: Location,
},
} }
impl ErrorExt for Error { impl ErrorExt for Error {
@@ -315,8 +307,7 @@ impl ErrorExt for Error {
| ConvertArrowArrayToScalars { .. } | ConvertArrowArrayToScalars { .. }
| ConvertScalarToArrowArray { .. } | ConvertScalarToArrowArray { .. }
| ParseExtendedType { .. } | ParseExtendedType { .. }
| InconsistentStructFieldsAndItems { .. } | InconsistentStructFieldsAndItems { .. } => StatusCode::Internal,
| ArrowMetadata { .. } => StatusCode::Internal,
} }
} }

View File

@@ -1,15 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod json;

View File

@@ -1,104 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType};
use serde::{Deserialize, Serialize};
use crate::json::JsonStructureSettings;
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct JsonMetadata {
/// Indicates how the JSON is stored in the underlying data type.
///
/// This field can be `None` when the data is converted to its complete, structured in-memory form.
pub json_structure_settings: Option<JsonStructureSettings>,
}
#[derive(Debug, Clone)]
pub struct JsonExtensionType(Arc<JsonMetadata>);
impl JsonExtensionType {
pub fn new(metadata: Arc<JsonMetadata>) -> Self {
JsonExtensionType(metadata)
}
}
impl ExtensionType for JsonExtensionType {
const NAME: &'static str = "greptime.json";
type Metadata = Arc<JsonMetadata>;
fn metadata(&self) -> &Self::Metadata {
&self.0
}
fn serialize_metadata(&self) -> Option<String> {
serde_json::to_string(self.metadata()).ok()
}
fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
if let Some(metadata) = metadata {
let metadata = serde_json::from_str(metadata).map_err(|e| {
ArrowError::ParseError(format!("Failed to deserialize JSON metadata: {}", e))
})?;
Ok(Arc::new(metadata))
} else {
Ok(Arc::new(JsonMetadata::default()))
}
}
fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
match data_type {
// object
DataType::Struct(_)
// array
| DataType::List(_)
| DataType::ListView(_)
| DataType::LargeList(_)
| DataType::LargeListView(_)
// string
| DataType::Utf8
| DataType::Utf8View
| DataType::LargeUtf8
// number
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Float32
| DataType::Float64
// boolean
| DataType::Boolean
// null
| DataType::Null
// legacy json type
| DataType::Binary => Ok(()),
dt => Err(ArrowError::SchemaError(format!(
"Unexpected data type {dt}"
))),
}
}
fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
let json = Self(metadata);
json.supports_data_type(data_type)?;
Ok(json)
}
}
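As a quick, standalone illustration of the extension convention used above (an editor's sketch, not part of this change: it relies only on Arrow's standard `ARROW:extension:*` metadata keys, and the field name and metadata JSON here are made up):
use std::collections::HashMap;
use arrow_schema::{DataType, Field};
fn main() {
    // The extension name and its serialized metadata travel in the field's metadata map,
    // which is what `serialize_metadata`/`deserialize_metadata` above read and write.
    let metadata = HashMap::from([
        (
            "ARROW:extension:name".to_string(),
            "greptime.json".to_string(),
        ),
        (
            "ARROW:extension:metadata".to_string(),
            r#"{"json_structure_settings":null}"#.to_string(),
        ),
    ]);
    let field = Field::new("payload", DataType::Utf8, true).with_metadata(metadata);
    // A reader recognizes the extension by name before deserializing its metadata.
    assert_eq!(
        field.metadata().get("ARROW:extension:name").map(String::as_str),
        Some("greptime.json")
    );
}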

View File

@@ -13,13 +13,11 @@
// limitations under the License. // limitations under the License.
#![feature(assert_matches)] #![feature(assert_matches)]
#![feature(box_patterns)]
pub mod arrow_array; pub mod arrow_array;
pub mod data_type; pub mod data_type;
pub mod duration; pub mod duration;
pub mod error; pub mod error;
pub mod extension;
pub mod interval; pub mod interval;
pub mod json; pub mod json;
pub mod macros; pub mod macros;

View File

@@ -32,8 +32,9 @@ pub use crate::schema::column_schema::{
COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY, COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY, COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY, COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata, FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY, Metadata, SKIPPING_INDEX_KEY, SkippingIndexOptions,
SkippingIndexType, TIME_INDEX_KEY,
}; };
pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema; pub use crate::schema::raw::RawSchema;

View File

@@ -17,17 +17,13 @@ use std::fmt;
use std::str::FromStr; use std::str::FromStr;
use arrow::datatypes::Field; use arrow::datatypes::Field;
use arrow_schema::extension::{
EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY, ExtensionType,
};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::{ResultExt, ensure}; use snafu::{ResultExt, ensure};
use sqlparser_derive::{Visit, VisitMut}; use sqlparser_derive::{Visit, VisitMut};
use crate::data_type::{ConcreteDataType, DataType}; use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{ use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result};
self, ArrowMetadataSnafu, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result, use crate::json::JsonStructureSettings;
};
use crate::schema::TYPE_KEY; use crate::schema::TYPE_KEY;
use crate::schema::constraint::ColumnDefaultConstraint; use crate::schema::constraint::ColumnDefaultConstraint;
use crate::value::Value; use crate::value::Value;
@@ -46,6 +42,7 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index"; pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
/// Key used to store skip options in arrow field's metadata. /// Key used to store skip options in arrow field's metadata.
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index"; pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
pub const JSON_STRUCTURE_SETTINGS_KEY: &str = "greptime:json:structure_settings";
/// Keys used in fulltext options /// Keys used in fulltext options
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable"; pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
@@ -397,38 +394,18 @@ impl ColumnSchema {
Ok(()) Ok(())
} }
pub fn extension_type<E>(&self) -> Result<Option<E>> pub fn json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
where self.metadata
E: ExtensionType, .get(JSON_STRUCTURE_SETTINGS_KEY)
{ .map(|json| serde_json::from_str(json).context(error::DeserializeSnafu { json }))
let extension_type_name = self.metadata.get(EXTENSION_TYPE_NAME_KEY); .transpose()
if extension_type_name.map(|s| s.as_str()) == Some(E::NAME) {
let extension_metadata = self.metadata.get(EXTENSION_TYPE_METADATA_KEY);
let extension_metadata =
E::deserialize_metadata(extension_metadata.map(|s| s.as_str()))
.context(ArrowMetadataSnafu)?;
let extension = E::try_new(&self.data_type.as_arrow_type(), extension_metadata)
.context(ArrowMetadataSnafu)?;
Ok(Some(extension))
} else {
Ok(None)
}
} }
pub fn with_extension_type<E>(&mut self, extension_type: &E) -> Result<()> pub fn with_json_structure_settings(&mut self, settings: &JsonStructureSettings) -> Result<()> {
where self.metadata.insert(
E: ExtensionType, JSON_STRUCTURE_SETTINGS_KEY.to_string(),
{ serde_json::to_string(settings).context(error::SerializeSnafu)?,
self.metadata );
.insert(EXTENSION_TYPE_NAME_KEY.to_string(), E::NAME.to_string());
if let Some(extension_metadata) = extension_type.serialize_metadata() {
self.metadata
.insert(EXTENSION_TYPE_METADATA_KEY.to_string(), extension_metadata);
}
Ok(()) Ok(())
} }
} }

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::{BTreeMap, HashMap}; use std::collections::BTreeMap;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::Arc;
@@ -31,12 +31,9 @@ use crate::scalars::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId; use crate::type_id::LogicalTypeId;
use crate::types::{ListType, StructField, StructType}; use crate::types::{ListType, StructField, StructType};
use crate::value::Value; use crate::value::Value;
use crate::vectors::json::builder::JsonVectorBuilder;
use crate::vectors::{BinaryVectorBuilder, MutableVector}; use crate::vectors::{BinaryVectorBuilder, MutableVector};
pub const JSON_TYPE_NAME: &str = "Json"; pub const JSON_TYPE_NAME: &str = "Json";
const JSON_PLAIN_FIELD_NAME: &str = "__plain__";
const JSON_PLAIN_FIELD_METADATA_KEY: &str = "is_plain_json";
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default)]
pub enum JsonFormat { pub enum JsonFormat {
@@ -57,46 +54,28 @@ impl JsonType {
Self { format } Self { format }
} }
pub(crate) fn empty() -> Self { // TODO(LFC): remove "allow unused"
Self { #[allow(unused)]
format: JsonFormat::Native(Box::new(ConcreteDataType::null_datatype())),
}
}
/// Make json type a struct type, by: /// Make json type a struct type, by:
/// - if the json is an object, its entries are mapped to struct fields, obviously; /// - if the json is an object, its entries are mapped to struct fields, obviously;
/// - if not, the json is one of bool, number, string or array, make it a special field called /// - if not, the json is one of bool, number, string or array, make it a special field called
/// [JSON_PLAIN_FIELD_NAME] with metadata [JSON_PLAIN_FIELD_METADATA_KEY] = `"true"` in a /// "__plain" in a struct with only that field.
/// struct with only that field.
pub(crate) fn as_struct_type(&self) -> StructType { pub(crate) fn as_struct_type(&self) -> StructType {
match &self.format { match &self.format {
JsonFormat::Jsonb => StructType::default(), JsonFormat::Jsonb => StructType::default(),
JsonFormat::Native(inner) => match inner.as_ref() { JsonFormat::Native(inner) => match inner.as_ref() {
ConcreteDataType::Struct(t) => t.clone(), ConcreteDataType::Struct(t) => t.clone(),
x => { x => StructType::new(Arc::new(vec![StructField::new(
let mut field = "__plain".to_string(),
StructField::new(JSON_PLAIN_FIELD_NAME.to_string(), x.clone(), true); x.clone(),
field.insert_metadata(JSON_PLAIN_FIELD_METADATA_KEY, true); true,
StructType::new(Arc::new(vec![field])) )])),
}
}, },
} }
} }
/// Check if this json type is the special "plain" one. // TODO(LFC): remove "allow unused"
/// See [JsonType::as_struct_type]. #[allow(unused)]
pub(crate) fn is_plain_json(&self) -> bool {
let JsonFormat::Native(box ConcreteDataType::Struct(t)) = &self.format else {
return true;
};
let fields = t.fields();
let Some((single, [])) = fields.split_first() else {
return false;
};
single.name() == JSON_PLAIN_FIELD_NAME
&& single.metadata(JSON_PLAIN_FIELD_METADATA_KEY) == Some("true")
}
/// Try to merge this json type with others, error on datatype conflict. /// Try to merge this json type with others, error on datatype conflict.
pub(crate) fn merge(&mut self, other: &JsonType) -> Result<()> { pub(crate) fn merge(&mut self, other: &JsonType) -> Result<()> {
match (&self.format, &other.format) { match (&self.format, &other.format) {
@@ -112,47 +91,6 @@ impl JsonType {
.fail(), .fail(),
} }
} }
pub(crate) fn is_mergeable(&self, other: &JsonType) -> bool {
match (&self.format, &other.format) {
(JsonFormat::Jsonb, JsonFormat::Jsonb) => true,
(JsonFormat::Native(this), JsonFormat::Native(that)) => {
is_mergeable(this.as_ref(), that.as_ref())
}
_ => false,
}
}
}
fn is_mergeable(this: &ConcreteDataType, that: &ConcreteDataType) -> bool {
fn is_mergeable_struct(this: &StructType, that: &StructType) -> bool {
let this_fields = this.fields();
let this_fields = this_fields
.iter()
.map(|x| (x.name(), x))
.collect::<HashMap<_, _>>();
for that_field in that.fields().iter() {
if let Some(this_field) = this_fields.get(that_field.name())
&& !is_mergeable(this_field.data_type(), that_field.data_type())
{
return false;
}
}
true
}
match (this, that) {
(this, that) if this == that => true,
(ConcreteDataType::List(this), ConcreteDataType::List(that)) => {
is_mergeable(this.item_type(), that.item_type())
}
(ConcreteDataType::Struct(this), ConcreteDataType::Struct(that)) => {
is_mergeable_struct(this, that)
}
(ConcreteDataType::Null(_), _) | (_, ConcreteDataType::Null(_)) => true,
_ => false,
}
} }
fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result<ConcreteDataType> { fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result<ConcreteDataType> {
@@ -228,10 +166,7 @@ impl DataType for JsonType {
} }
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> { fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
match self.format { Box::new(BinaryVectorBuilder::with_capacity(capacity))
JsonFormat::Jsonb => Box::new(BinaryVectorBuilder::with_capacity(capacity)),
JsonFormat::Native(_) => Box::new(JsonVectorBuilder::with_capacity(capacity)),
}
} }
fn try_cast(&self, from: Value) -> Option<Value> { fn try_cast(&self, from: Value) -> Option<Value> {
@@ -291,12 +226,10 @@ mod tests {
let result = json_type.merge(other); let result = json_type.merge(other);
match (result, expected) { match (result, expected) {
(Ok(()), Ok(expected)) => { (Ok(()), Ok(expected)) => {
assert_eq!(json_type.name(), expected); assert_eq!(json_type.name(), expected)
assert!(json_type.is_mergeable(other));
} }
(Err(err), Err(expected)) => { (Err(err), Err(expected)) => {
assert_eq!(err.to_string(), expected); assert_eq!(err.to_string(), expected)
assert!(!json_type.is_mergeable(other));
} }
_ => unreachable!(), _ => unreachable!(),
} }
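To make the "plain json becomes a one-field struct" rule above concrete, here is a tiny standalone sketch over serde_json values (an editor's illustration; it does not use this crate's types, and the "__plain__" field name is the marker used by one side of this diff):
use serde_json::{Value, json};
// Wrap any non-object JSON value into a single-field object, mirroring how
// `as_struct_type` turns a plain json into a struct with one special field.
fn as_object(value: Value) -> Value {
    match value {
        Value::Object(_) => value,
        other => json!({ "__plain__": other }),
    }
}
fn main() {
    assert_eq!(as_object(json!(1)), json!({ "__plain__": 1 }));
    assert_eq!(as_object(json!({ "a": true })), json!({ "a": true }));
}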

View File

@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::BTreeMap;
use std::sync::Arc; use std::sync::Arc;
use arrow::datatypes::{DataType as ArrowDataType, Field}; use arrow::datatypes::{DataType as ArrowDataType, Field};
@@ -47,15 +46,6 @@ impl TryFrom<&Fields> for StructType {
} }
} }
impl<const N: usize> From<[StructField; N]> for StructType {
fn from(value: [StructField; N]) -> Self {
let value: Box<[StructField]> = Box::new(value);
Self {
fields: Arc::new(value.into_vec()),
}
}
}
impl DataType for StructType { impl DataType for StructType {
fn name(&self) -> String { fn name(&self) -> String {
format!( format!(
@@ -118,7 +108,6 @@ pub struct StructField {
name: String, name: String,
data_type: ConcreteDataType, data_type: ConcreteDataType,
nullable: bool, nullable: bool,
metadata: BTreeMap<String, String>,
} }
impl StructField { impl StructField {
@@ -127,7 +116,6 @@ impl StructField {
name, name,
data_type, data_type,
nullable, nullable,
metadata: BTreeMap::new(),
} }
} }
@@ -147,25 +135,11 @@ impl StructField {
self.nullable self.nullable
} }
pub(crate) fn insert_metadata(&mut self, key: impl ToString, value: impl ToString) {
self.metadata.insert(key.to_string(), value.to_string());
}
pub(crate) fn metadata(&self, key: &str) -> Option<&str> {
self.metadata.get(key).map(String::as_str)
}
pub fn to_df_field(&self) -> Field { pub fn to_df_field(&self) -> Field {
let metadata = self
.metadata
.iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
Field::new( Field::new(
self.name.clone(), self.name.clone(),
self.data_type.as_arrow_type(), self.data_type.as_arrow_type(),
self.nullable, self.nullable,
) )
.with_metadata(metadata)
} }
} }

View File

@@ -873,12 +873,6 @@ impl From<&[u8]> for Value {
} }
} }
impl From<()> for Value {
fn from(_: ()) -> Self {
Value::Null
}
}
impl TryFrom<Value> for serde_json::Value { impl TryFrom<Value> for serde_json::Value {
type Error = serde_json::Error; type Error = serde_json::Error;

View File

@@ -35,7 +35,6 @@ mod duration;
mod eq; mod eq;
mod helper; mod helper;
mod interval; mod interval;
pub(crate) mod json;
mod list; mod list;
mod null; mod null;
pub(crate) mod operations; pub(crate) mod operations;

View File

@@ -464,14 +464,6 @@ impl Helper {
} }
} }
#[cfg(test)]
pub(crate) fn pretty_print(vector: VectorRef) -> String {
let array = vector.to_arrow_array();
arrow::util::pretty::pretty_format_columns(&vector.vector_type_name(), &[array])
.map(|x| x.to_string())
.unwrap_or_else(|e| e.to_string())
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use arrow::array::{ use arrow::array::{

View File

@@ -1,15 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub(crate) mod builder;

View File

@@ -1,485 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{Result, TryFromValueSnafu, UnsupportedOperationSnafu};
use crate::prelude::{ValueRef, Vector, VectorRef};
use crate::types::JsonType;
use crate::value::StructValueRef;
use crate::vectors::{MutableVector, StructVectorBuilder};
struct JsonStructsBuilder {
json_type: JsonType,
inner: StructVectorBuilder,
}
impl JsonStructsBuilder {
fn new(json_type: JsonType, capacity: usize) -> Self {
let struct_type = json_type.as_struct_type();
let inner = StructVectorBuilder::with_type_and_capacity(struct_type, capacity);
Self { json_type, inner }
}
fn len(&self) -> usize {
self.inner.len()
}
fn push(&mut self, value: &ValueRef) -> Result<()> {
if self.json_type.is_plain_json() {
let value = ValueRef::Struct(StructValueRef::RefList {
val: vec![value.clone()],
fields: self.json_type.as_struct_type(),
});
self.inner.try_push_value_ref(&value)
} else {
self.inner.try_push_value_ref(value)
}
}
/// Try to merge (and consume the data of) the other json builder into this one.
/// Note that the other builder's json type must be mergeable with this one's
/// (this one's json type has all the fields of the other's, with no datatype conflicts).
/// Normally this is guaranteed, as long as json values are pushed through [JsonVectorBuilder].
fn try_merge(&mut self, other: &mut JsonStructsBuilder) -> Result<()> {
debug_assert!(self.json_type.is_mergeable(&other.json_type));
fn helper(this: &mut StructVectorBuilder, that: &mut StructVectorBuilder) -> Result<()> {
let that_len = that.len();
if let Some(x) = that.mut_null_buffer().finish() {
this.mut_null_buffer().append_buffer(&x)
} else {
this.mut_null_buffer().append_n_non_nulls(that_len);
}
let that_fields = that.struct_type().fields();
let mut that_builders = that_fields
.iter()
.zip(that.mut_value_builders().iter_mut())
.map(|(field, builder)| (field.name(), builder))
.collect::<HashMap<_, _>>();
for (field, this_builder) in this
.struct_type()
.fields()
.iter()
.zip(this.mut_value_builders().iter_mut())
{
if let Some(that_builder) = that_builders.get_mut(field.name()) {
if field.data_type().is_struct() {
let this = this_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: a struct datatype field must be corresponding to a struct vector builder.
.unwrap();
let that = that_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: other builder with same field name must have same datatype,
// ensured because the two json types are mergeable.
.unwrap();
helper(this, that)?;
} else {
let vector = that_builder.to_vector();
this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?;
}
} else {
this_builder.push_nulls(that_len);
}
}
Ok(())
}
helper(&mut self.inner, &mut other.inner)
}
/// Same as [JsonStructsBuilder::try_merge], but does not consume the other builder's data.
fn try_merge_cloned(&mut self, other: &JsonStructsBuilder) -> Result<()> {
debug_assert!(self.json_type.is_mergeable(&other.json_type));
fn helper(this: &mut StructVectorBuilder, that: &StructVectorBuilder) -> Result<()> {
let that_len = that.len();
if let Some(x) = that.null_buffer().finish_cloned() {
this.mut_null_buffer().append_buffer(&x)
} else {
this.mut_null_buffer().append_n_non_nulls(that_len);
}
let that_fields = that.struct_type().fields();
let that_builders = that_fields
.iter()
.zip(that.value_builders().iter())
.map(|(field, builder)| (field.name(), builder))
.collect::<HashMap<_, _>>();
for (field, this_builder) in this
.struct_type()
.fields()
.iter()
.zip(this.mut_value_builders().iter_mut())
{
if let Some(that_builder) = that_builders.get(field.name()) {
if field.data_type().is_struct() {
let this = this_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: a struct datatype field must be corresponding to a struct vector builder.
.unwrap();
let that = that_builder
.as_any()
.downcast_ref::<StructVectorBuilder>()
// Safety: other builder with same field name must have same datatype,
// ensured because the two json types are mergeable.
.unwrap();
helper(this, that)?;
} else {
let vector = that_builder.to_vector_cloned();
this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?;
}
} else {
this_builder.push_nulls(that_len);
}
}
Ok(())
}
helper(&mut self.inner, &other.inner)
}
}
/// The vector builder for json type values.
///
/// Json types are dynamic, to some degree (as long as they can be merged into each other), and so
/// are json values. Json values are physically stored in struct vectors, which require the types of
/// struct values to be fixed inside a certain struct vector. To resolve this "dynamic" vs "fixed"
/// datatype problem, each type of json value gets its own struct vector builder in this builder.
/// Once a value of a new json type is pushed into this builder, a new "child" builder is created for it.
///
/// Given the "mixed" nature of the values stored in this builder, a "merge" operation is performed
/// to produce the json vector. The "merge" iterates over all the "child" builders and fills
/// nulls for missing json fields. The final vector's json type is fixed to be the "merge" of all
/// pushed json types.
pub(crate) struct JsonVectorBuilder {
merged_type: JsonType,
capacity: usize,
builders: Vec<JsonStructsBuilder>,
}
impl JsonVectorBuilder {
pub(crate) fn with_capacity(capacity: usize) -> Self {
Self {
merged_type: JsonType::empty(),
capacity,
builders: vec![],
}
}
fn try_create_new_builder(&mut self, json_type: &JsonType) -> Result<&mut JsonStructsBuilder> {
self.merged_type.merge(json_type)?;
let builder = JsonStructsBuilder::new(json_type.clone(), self.capacity);
self.builders.push(builder);
let len = self.builders.len();
Ok(&mut self.builders[len - 1])
}
}
impl MutableVector for JsonVectorBuilder {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::Json(self.merged_type.clone())
}
fn len(&self) -> usize {
self.builders.iter().map(|x| x.len()).sum()
}
fn as_any(&self) -> &dyn Any {
self
}
fn as_mut_any(&mut self) -> &mut dyn Any {
self
}
fn to_vector(&mut self) -> VectorRef {
// Fast path:
if self.builders.len() == 1 {
return self.builders[0].inner.to_vector();
}
let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity);
for builder in self.builders.iter_mut() {
unified_jsons
.try_merge(builder)
// Safety: the "unified_jsons" has the merged json type from all the builders,
// so it should merge them without errors.
.unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}"));
}
unified_jsons.inner.to_vector()
}
fn to_vector_cloned(&self) -> VectorRef {
// Fast path:
if self.builders.len() == 1 {
return self.builders[0].inner.to_vector_cloned();
}
let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity);
for builder in self.builders.iter() {
unified_jsons
.try_merge_cloned(builder)
// Safety: the "unified_jsons" has the merged json type from all the builders,
// so it should merge them without errors.
.unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}"));
}
unified_jsons.inner.to_vector_cloned()
}
fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
let data_type = value.data_type();
let json_type = data_type.as_json().with_context(|| TryFromValueSnafu {
reason: format!("expected json value, got {value:?}"),
})?;
let builder = match self.builders.last_mut() {
Some(last) => {
if &last.json_type != json_type {
self.try_create_new_builder(json_type)?
} else {
last
}
}
None => self.try_create_new_builder(json_type)?,
};
let ValueRef::Json(value) = value else {
// Safety: a value whose datatype is json must be a json value.
unreachable!()
};
builder.push(value)
}
fn push_null(&mut self) {
let null_json_value = ValueRef::Json(Box::new(ValueRef::Null));
self.try_push_value_ref(&null_json_value)
// Safety: as the method "try_push_value_ref" shows, a null json value should always be
// pushable into any json vector.
.unwrap_or_else(|e| {
panic!("failed to push null json value: {null_json_value:?}, error: {e}")
});
}
fn extend_slice_of(&mut self, _: &dyn Vector, _: usize, _: usize) -> Result<()> {
UnsupportedOperationSnafu {
op: "extend_slice_of",
vector_type: "JsonVector",
}
.fail()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::data_type::DataType;
use crate::json::JsonStructureSettings;
use crate::vectors::helper::pretty_print;
fn push(json: &str, builder: &mut JsonVectorBuilder, expected: std::result::Result<(), &str>) {
let settings = JsonStructureSettings::Structured(None);
let json: serde_json::Value = serde_json::from_str(json).unwrap();
let value = settings.encode(json).unwrap();
let value = value.as_value_ref();
let result = builder.try_push_value_ref(&value);
match (result, expected) {
(Ok(()), Ok(())) => (),
(Err(e), Err(expected)) => assert_eq!(e.to_string(), expected),
_ => unreachable!(),
}
}
#[test]
fn test_push_plain_jsons() -> Result<()> {
let jsons = vec!["1", "2", r#""s""#, "[true]"];
let results = vec![
Ok(()),
Ok(()),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: String",
),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: List<Boolean>",
),
];
let mut builder = JsonVectorBuilder::with_capacity(1);
for (json, result) in jsons.into_iter().zip(results.into_iter()) {
push(json, &mut builder, result);
}
let vector = builder.to_vector();
let expected = r#"
+----------------+
| StructVector |
+----------------+
| {__plain__: 1} |
| {__plain__: 2} |
+----------------+"#;
assert_eq!(pretty_print(vector), expected.trim());
Ok(())
}
#[test]
fn test_push_json_objects() -> Result<()> {
let jsons = vec![
r#"{
"s": "a",
"list": [1, 2, 3]
}"#,
r#"{
"list": [4],
"s": "b"
}"#,
r#"{
"s": "c",
"float": 0.9
}"#,
r#"{
"float": 0.8,
"s": "d"
}"#,
r#"{
"float": 0.7,
"int": -1
}"#,
r#"{
"int": 0,
"float": 0.6
}"#,
r#"{
"int": 1,
"object": {"hello": "world", "timestamp": 1761523200000}
}"#,
r#"{
"object": {"hello": "greptime", "timestamp": 1761523201000},
"int": 2
}"#,
r#"{
"object": {"timestamp": 1761523202000},
"nested": {"a": {"b": {"b": {"a": "abba"}}}}
}"#,
r#"{
"nested": {"a": {"b": {"a": {"b": "abab"}}}},
"object": {"timestamp": 1761523203000}
}"#,
];
let mut builder = JsonVectorBuilder::with_capacity(1);
for json in jsons {
push(json, &mut builder, Ok(()));
}
assert_eq!(builder.len(), 10);
// test children builders:
assert_eq!(builder.builders.len(), 6);
let expect_types = [
r#"Json<Struct<"list": List<Int64>, "s": String>>"#,
r#"Json<Struct<"float": Float64, "s": String>>"#,
r#"Json<Struct<"float": Float64, "int": Int64>>"#,
r#"Json<Struct<"int": Int64, "object": Struct<"hello": String, "timestamp": Int64>>>"#,
r#"Json<Struct<"nested": Struct<"a": Struct<"b": Struct<"b": Struct<"a": String>>>>, "object": Struct<"timestamp": Int64>>>"#,
r#"Json<Struct<"nested": Struct<"a": Struct<"b": Struct<"a": Struct<"b": String>>>>, "object": Struct<"timestamp": Int64>>>"#,
];
let expect_vectors = [
r#"
+-------------------------+
| StructVector |
+-------------------------+
| {list: [1, 2, 3], s: a} |
| {list: [4], s: b} |
+-------------------------+"#,
r#"
+--------------------+
| StructVector |
+--------------------+
| {float: 0.9, s: c} |
| {float: 0.8, s: d} |
+--------------------+"#,
r#"
+-----------------------+
| StructVector |
+-----------------------+
| {float: 0.7, int: -1} |
| {float: 0.6, int: 0} |
+-----------------------+"#,
r#"
+---------------------------------------------------------------+
| StructVector |
+---------------------------------------------------------------+
| {int: 1, object: {hello: world, timestamp: 1761523200000}} |
| {int: 2, object: {hello: greptime, timestamp: 1761523201000}} |
+---------------------------------------------------------------+"#,
r#"
+------------------------------------------------------------------------+
| StructVector |
+------------------------------------------------------------------------+
| {nested: {a: {b: {b: {a: abba}}}}, object: {timestamp: 1761523202000}} |
+------------------------------------------------------------------------+"#,
r#"
+------------------------------------------------------------------------+
| StructVector |
+------------------------------------------------------------------------+
| {nested: {a: {b: {a: {b: abab}}}}, object: {timestamp: 1761523203000}} |
+------------------------------------------------------------------------+"#,
];
for (builder, (expect_type, expect_vector)) in builder
.builders
.iter()
.zip(expect_types.into_iter().zip(expect_vectors.into_iter()))
{
assert_eq!(builder.json_type.name(), expect_type);
let vector = builder.inner.to_vector_cloned();
assert_eq!(pretty_print(vector), expect_vector.trim());
}
// test final merged json type:
let expected = r#"Json<Struct<"float": Float64, "int": Int64, "list": List<Int64>, "nested": Struct<"a": Struct<"b": Struct<"a": Struct<"b": String>, "b": Struct<"a": String>>>>, "object": Struct<"hello": String, "timestamp": Int64>, "s": String>>"#;
assert_eq!(builder.data_type().to_string(), expected);
// test final produced vector:
let expected = r#"
+-------------------------------------------------------------------------------------------------------------------+
| StructVector |
+-------------------------------------------------------------------------------------------------------------------+
| {float: , int: , list: [1, 2, 3], nested: , object: , s: a} |
| {float: , int: , list: [4], nested: , object: , s: b} |
| {float: 0.9, int: , list: , nested: , object: , s: c} |
| {float: 0.8, int: , list: , nested: , object: , s: d} |
| {float: 0.7, int: -1, list: , nested: , object: , s: } |
| {float: 0.6, int: 0, list: , nested: , object: , s: } |
| {float: , int: 1, list: , nested: , object: {hello: world, timestamp: 1761523200000}, s: } |
| {float: , int: 2, list: , nested: , object: {hello: greptime, timestamp: 1761523201000}, s: } |
| {float: , int: , list: , nested: {a: {b: {a: , b: {a: abba}}}}, object: {hello: , timestamp: 1761523202000}, s: } |
| {float: , int: , list: , nested: {a: {b: {a: {b: abab}, b: }}}, object: {hello: , timestamp: 1761523203000}, s: } |
+-------------------------------------------------------------------------------------------------------------------+"#;
let vector = builder.to_vector_cloned();
assert_eq!(pretty_print(vector), expected.trim());
let vector = builder.to_vector();
assert_eq!(pretty_print(vector), expected.trim());
Ok(())
}
}
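The doc comment on JsonVectorBuilder above describes the core trick: collect values per concrete json type, then merge them under the union of all fields, filling nulls for the fields a row does not have. As a compact, standalone sketch of that null-filling merge (an editor's illustration over plain JSON objects, independent of this crate's vector types):
use std::collections::BTreeSet;
use serde_json::{Map, Value};
// Unify rows with different key sets under the union of all keys,
// filling the missing keys with null.
fn unify(rows: Vec<Map<String, Value>>) -> Vec<Map<String, Value>> {
    let keys: BTreeSet<String> = rows.iter().flat_map(|row| row.keys().cloned()).collect();
    rows.into_iter()
        .map(|row| {
            keys.iter()
                .map(|k| (k.clone(), row.get(k).cloned().unwrap_or(Value::Null)))
                .collect()
        })
        .collect()
}
fn main() {
    let rows: Vec<Map<String, Value>> = vec![
        serde_json::from_str(r#"{"s":"a","list":[1,2,3]}"#).unwrap(),
        serde_json::from_str(r#"{"s":"c","float":0.9}"#).unwrap(),
    ];
    for row in unify(rows) {
        // Prints {"float":null,"list":[1,2,3],"s":"a"} and {"float":0.9,"list":null,"s":"c"}.
        println!("{}", Value::Object(row));
    }
}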

View File

@@ -323,26 +323,6 @@ impl StructVectorBuilder {
} }
self.null_buffer.append_null(); self.null_buffer.append_null();
} }
pub(crate) fn struct_type(&self) -> &StructType {
&self.fields
}
pub(crate) fn value_builders(&self) -> &[Box<dyn MutableVector>] {
&self.value_builders
}
pub(crate) fn mut_value_builders(&mut self) -> &mut [Box<dyn MutableVector>] {
&mut self.value_builders
}
pub(crate) fn null_buffer(&self) -> &NullBufferBuilder {
&self.null_buffer
}
pub(crate) fn mut_null_buffer(&mut self) -> &mut NullBufferBuilder {
&mut self.null_buffer
}
} }
impl MutableVector for StructVectorBuilder { impl MutableVector for StructVectorBuilder {

View File

@@ -21,7 +21,6 @@ use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime}; use std::time::{Duration, Instant, SystemTime};
use api::v1::{RowDeleteRequest, RowDeleteRequests, RowInsertRequest, RowInsertRequests}; use api::v1::{RowDeleteRequest, RowDeleteRequests, RowInsertRequest, RowInsertRequests};
use common_base::memory_limit::MemoryLimit;
use common_config::Configurable; use common_config::Configurable;
use common_error::ext::BoxedError; use common_error::ext::BoxedError;
use common_meta::key::TableMetadataManagerRef; use common_meta::key::TableMetadataManagerRef;
@@ -133,7 +132,6 @@ impl Default for FlownodeOptions {
query: QueryOptions { query: QueryOptions {
parallelism: 1, parallelism: 1,
allow_query_fallback: false, allow_query_fallback: false,
memory_pool_size: MemoryLimit::default(),
}, },
user_provider: None, user_provider: None,
memory: MemoryOptions::default(), memory: MemoryOptions::default(),

View File

@@ -23,7 +23,7 @@ use api::v1::query_request::Query;
use api::v1::{CreateTableExpr, QueryRequest}; use api::v1::{CreateTableExpr, QueryRequest};
use client::{Client, Database}; use client::{Client, Database};
use common_error::ext::{BoxedError, ErrorExt}; use common_error::ext::{BoxedError, ErrorExt};
use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_tls_config}; use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role}; use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
use common_meta::peer::Peer; use common_meta::peer::Peer;
use common_meta::rpc::store::RangeRequest; use common_meta::rpc::store::RangeRequest;
@@ -123,10 +123,12 @@ impl FrontendClient {
let cfg = ChannelConfig::new() let cfg = ChannelConfig::new()
.connect_timeout(batch_opts.grpc_conn_timeout) .connect_timeout(batch_opts.grpc_conn_timeout)
.timeout(batch_opts.query_timeout); .timeout(batch_opts.query_timeout);
if let Some(tls) = &batch_opts.frontend_tls {
let tls_config = load_tls_config(batch_opts.frontend_tls.as_ref()) let cfg = cfg.client_tls_config(tls.clone());
.context(InvalidClientConfigSnafu)?; ChannelManager::with_tls_config(cfg).context(InvalidClientConfigSnafu)?
ChannelManager::with_config(cfg, tls_config) } else {
ChannelManager::with_config(cfg)
}
}, },
auth, auth,
query, query,

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::{HashMap, HashSet}; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use async_trait::async_trait; use async_trait::async_trait;
@@ -28,7 +28,6 @@ use common_function::scalars::udf::create_udf;
use common_query::{Output, OutputData}; use common_query::{Output, OutputData};
use common_recordbatch::adapter::RecordBatchStreamAdapter; use common_recordbatch::adapter::RecordBatchStreamAdapter;
use common_recordbatch::util; use common_recordbatch::util;
use common_telemetry::warn;
use datafusion::dataframe::DataFrame; use datafusion::dataframe::DataFrame;
use datafusion::execution::SessionStateBuilder; use datafusion::execution::SessionStateBuilder;
use datafusion::execution::context::SessionContext; use datafusion::execution::context::SessionContext;
@@ -43,9 +42,8 @@ use servers::error::{
}; };
use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams}; use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams};
use servers::otlp::trace::{ use servers::otlp::trace::{
DURATION_NANO_COLUMN, KEY_OTEL_STATUS_ERROR_KEY, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, DURATION_NANO_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_KIND_COLUMN,
SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, SPAN_STATUS_ERROR, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, TIMESTAMP_COLUMN, TRACE_ID_COLUMN,
TIMESTAMP_COLUMN, TRACE_ID_COLUMN,
}; };
use servers::query_handler::JaegerQueryHandler; use servers::query_handler::JaegerQueryHandler;
use session::context::QueryContextRef; use session::context::QueryContextRef;
@@ -265,7 +263,7 @@ impl JaegerQueryHandler for Instance {
self.query_engine(), self.query_engine(),
vec![wildcard()], vec![wildcard()],
filters, filters,
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. vec![],
None, None,
None, None,
vec![], vec![],
@@ -324,7 +322,6 @@ async fn query_trace_table(
})?; })?;
let is_data_model_v1 = table let is_data_model_v1 = table
.clone()
.table_info() .table_info()
.meta .meta
.options .options
@@ -333,14 +330,6 @@ async fn query_trace_table(
.map(|s| s.as_str()) .map(|s| s.as_str())
== Some(TABLE_DATA_MODEL_TRACE_V1); == Some(TABLE_DATA_MODEL_TRACE_V1);
// collect to set
let col_names = table
.table_info()
.meta
.field_column_names()
.map(|s| format!("\"{}\"", s))
.collect::<HashSet<String>>();
let df_context = create_df_context(query_engine)?; let df_context = create_df_context(query_engine)?;
let dataframe = df_context let dataframe = df_context
@@ -353,7 +342,7 @@ async fn query_trace_table(
let dataframe = filters let dataframe = filters
.into_iter() .into_iter()
.chain(tags.map_or(Ok(vec![]), |t| { .chain(tags.map_or(Ok(vec![]), |t| {
tags_filters(&dataframe, t, is_data_model_v1, &col_names) tags_filters(&dataframe, t, is_data_model_v1)
})?) })?)
.try_fold(dataframe, |df, expr| { .try_fold(dataframe, |df, expr| {
df.filter(expr).context(DataFusionSnafu) df.filter(expr).context(DataFusionSnafu)
@@ -483,73 +472,23 @@ fn json_tag_filters(
Ok(filters) Ok(filters)
} }
/// Helper function to check if span_key or resource_key exists in col_names and create an expression. fn flatten_tag_filters(tags: HashMap<String, JsonValue>) -> ServerResult<Vec<Expr>> {
/// If neither exists, logs a warning and returns None.
#[inline]
fn check_col_and_build_expr<F>(
span_key: String,
resource_key: String,
key: &str,
col_names: &HashSet<String>,
expr_builder: F,
) -> Option<Expr>
where
F: FnOnce(String) -> Expr,
{
if col_names.contains(&span_key) {
return Some(expr_builder(span_key));
}
if col_names.contains(&resource_key) {
return Some(expr_builder(resource_key));
}
warn!("tag key {} not found in table columns", key);
None
}
fn flatten_tag_filters(
tags: HashMap<String, JsonValue>,
col_names: &HashSet<String>,
) -> ServerResult<Vec<Expr>> {
let filters = tags let filters = tags
.into_iter() .into_iter()
.filter_map(|(key, value)| { .filter_map(|(key, value)| {
if key == KEY_OTEL_STATUS_ERROR_KEY && value == JsonValue::Bool(true) { let key = format!("\"span_attributes.{}\"", key);
return Some(col(SPAN_STATUS_CODE).eq(lit(SPAN_STATUS_ERROR)));
}
// TODO(shuiyisong): add more precise mapping from key to col name
let span_key = format!("\"span_attributes.{}\"", key);
let resource_key = format!("\"resource_attributes.{}\"", key);
match value { match value {
JsonValue::String(value) => { JsonValue::String(value) => Some(col(key).eq(lit(value))),
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
}
JsonValue::Number(value) => { JsonValue::Number(value) => {
if value.is_f64() { if value.is_f64() {
// safe to unwrap as checked previously // safe to unwrap as checked previously
let value = value.as_f64().unwrap(); Some(col(key).eq(lit(value.as_f64().unwrap())))
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
} else { } else {
let value = value.as_i64().unwrap(); Some(col(key).eq(lit(value.as_i64().unwrap())))
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
} }
} }
JsonValue::Bool(value) => { JsonValue::Bool(value) => Some(col(key).eq(lit(value))),
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| { JsonValue::Null => Some(col(key).is_null()),
col(k).eq(lit(value))
})
}
JsonValue::Null => {
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).is_null()
})
}
// not supported at the moment // not supported at the moment
JsonValue::Array(_value) => None, JsonValue::Array(_value) => None,
JsonValue::Object(_value) => None, JsonValue::Object(_value) => None,
@@ -563,10 +502,9 @@ fn tags_filters(
dataframe: &DataFrame, dataframe: &DataFrame,
tags: HashMap<String, JsonValue>, tags: HashMap<String, JsonValue>,
is_data_model_v1: bool, is_data_model_v1: bool,
col_names: &HashSet<String>,
) -> ServerResult<Vec<Expr>> { ) -> ServerResult<Vec<Expr>> {
if is_data_model_v1 { if is_data_model_v1 {
flatten_tag_filters(tags, col_names) flatten_tag_filters(tags)
} else { } else {
json_tag_filters(dataframe, tags) json_tag_filters(dataframe, tags)
} }
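For reference, the flattened tag filters above are plain DataFusion expressions built with `col`/`lit`. A minimal standalone sketch of that shape (an editor's illustration; the `span_attributes.` column prefix is taken from this diff, everything else is assumed, and numbers, arrays and objects are elided):
use std::collections::HashMap;
use datafusion::prelude::{Expr, col, lit};
use serde_json::Value as JsonValue;
// Turn Jaeger-style tag key/values into per-column equality filters.
fn tag_filters(tags: HashMap<String, JsonValue>) -> Vec<Expr> {
    tags.into_iter()
        .filter_map(|(key, value)| {
            let column = format!("\"span_attributes.{}\"", key);
            match value {
                JsonValue::String(s) => Some(col(column).eq(lit(s))),
                JsonValue::Bool(b) => Some(col(column).eq(lit(b))),
                JsonValue::Null => Some(col(column).is_null()),
                _ => None, // numbers, arrays and objects are left out of this sketch
            }
        })
        .collect()
}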

View File

@@ -36,7 +36,7 @@ async fn run() {
.timeout(Duration::from_secs(3)) .timeout(Duration::from_secs(3))
.connect_timeout(Duration::from_secs(5)) .connect_timeout(Duration::from_secs(5))
.tcp_nodelay(true); .tcp_nodelay(true);
let channel_manager = ChannelManager::with_config(config, None); let channel_manager = ChannelManager::with_config(config);
let mut meta_client = MetaClientBuilder::datanode_default_options(id) let mut meta_client = MetaClientBuilder::datanode_default_options(id)
.channel_manager(channel_manager) .channel_manager(channel_manager)
.build(); .build();

View File

@@ -101,7 +101,7 @@ pub async fn create_meta_client(
if let MetaClientType::Frontend = client_type { if let MetaClientType::Frontend = client_type {
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout); let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);
builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config, None)); builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config));
if let Some(plugins) = plugins { if let Some(plugins) = plugins {
let region_follower = plugins.get::<RegionFollowerClientRef>(); let region_follower = plugins.get::<RegionFollowerClientRef>();
if let Some(region_follower) = region_follower { if let Some(region_follower) = region_follower {
@@ -112,8 +112,8 @@ pub async fn create_meta_client(
} }
builder = builder builder = builder
.channel_manager(ChannelManager::with_config(base_config, None)) .channel_manager(ChannelManager::with_config(base_config))
.heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config, None)); .heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config));
let mut meta_client = builder.build(); let mut meta_client = builder.build();

View File

@@ -72,10 +72,7 @@ serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
servers.workspace = true servers.workspace = true
snafu.workspace = true snafu.workspace = true
sqlx = { workspace = true, features = [ sqlx = { workspace = true, optional = true }
"mysql",
"chrono",
], optional = true }
store-api.workspace = true store-api.workspace = true
strum.workspace = true strum.workspace = true
table.workspace = true table.workspace = true

View File

@@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration; use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, info, warn}; use common_telemetry::{error, warn};
use common_time::Timestamp; use common_time::Timestamp;
use snafu::{OptionExt, ResultExt, ensure}; use snafu::{OptionExt, ResultExt, ensure};
use sqlx::mysql::{MySqlArguments, MySqlRow}; use sqlx::mysql::{MySqlArguments, MySqlRow};
@@ -645,13 +645,6 @@ impl Election for MySqlElection {
} }
async fn reset_campaign(&self) { async fn reset_campaign(&self) {
info!("Resetting campaign");
if self.is_leader.load(Ordering::Relaxed) {
if let Err(err) = self.step_down_without_lock().await {
error!(err; "Failed to step down without lock");
}
info!("Step down without lock successfully, due to reset campaign");
}
if let Err(err) = self.client.lock().await.reset_client().await { if let Err(err) = self.client.lock().await.reset_client().await {
error!(err; "Failed to reset client"); error!(err; "Failed to reset client");
} }

View File

@@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration; use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, info, warn}; use common_telemetry::{error, warn};
use common_time::Timestamp; use common_time::Timestamp;
use deadpool_postgres::{Manager, Pool}; use deadpool_postgres::{Manager, Pool};
use snafu::{OptionExt, ResultExt, ensure}; use snafu::{OptionExt, ResultExt, ensure};
@@ -477,13 +477,6 @@ impl Election for PgElection {
} }
async fn reset_campaign(&self) { async fn reset_campaign(&self) {
info!("Resetting campaign");
if self.is_leader.load(Ordering::Relaxed) {
if let Err(err) = self.step_down_without_lock().await {
error!(err; "Failed to step down without lock");
}
info!("Step down without lock successfully, due to reset campaign");
}
if let Err(err) = self.pg_client.write().await.reset_client().await { if let Err(err) = self.pg_client.write().await.reset_client().await {
error!(err; "Failed to reset client"); error!(err; "Failed to reset client");
} }
@@ -781,12 +774,16 @@ impl PgElection {
key: key.clone(), key: key.clone(),
..Default::default() ..Default::default()
}; };
send_leader_change_and_set_flags( if self
&self.is_leader, .is_leader
&self.leader_infancy, .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
&self.leader_watcher, .is_ok()
LeaderChangeMessage::StepDown(Arc::new(leader_key)), && let Err(e) = self
); .leader_watcher
.send(LeaderChangeMessage::StepDown(Arc::new(leader_key)))
{
error!(e; "Failed to send leader change message");
}
Ok(()) Ok(())
} }

View File

@@ -19,7 +19,6 @@ use api::v1::meta::{HeartbeatRequest, RegionLease, Role};
use async_trait::async_trait; use async_trait::async_trait;
use common_meta::key::TableMetadataManagerRef; use common_meta::key::TableMetadataManagerRef;
use common_meta::region_keeper::MemoryRegionKeeperRef; use common_meta::region_keeper::MemoryRegionKeeperRef;
use common_telemetry::error;
use store_api::region_engine::GrantedRegion; use store_api::region_engine::GrantedRegion;
use store_api::storage::RegionId; use store_api::storage::RegionId;
@@ -84,44 +83,36 @@ impl HeartbeatHandler for RegionLeaseHandler {
let regions = stat.regions(); let regions = stat.regions();
let datanode_id = stat.id; let datanode_id = stat.id;
match self let RenewRegionLeasesResponse {
non_exists,
renewed,
} = self
.region_lease_keeper .region_lease_keeper
.renew_region_leases(datanode_id, &regions) .renew_region_leases(datanode_id, &regions)
.await .await?;
{
Ok(RenewRegionLeasesResponse {
non_exists,
renewed,
}) => {
let renewed = if let Some(renewer) = &self.customized_region_lease_renewer {
renewer
.renew(ctx, renewed)
.into_iter()
.map(|region| region.into())
.collect()
} else {
renewed
.into_iter()
.map(|(region_id, region_lease_info)| {
GrantedRegion::new(region_id, region_lease_info.role).into()
})
.collect::<Vec<_>>()
};
acc.region_lease = Some(RegionLease { let renewed = if let Some(renewer) = &self.customized_region_lease_renewer {
regions: renewed, renewer
duration_since_epoch: req.duration_since_epoch, .renew(ctx, renewed)
lease_seconds: self.region_lease_seconds, .into_iter()
closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(), .map(|region| region.into())
}); .collect()
acc.inactive_region_ids = non_exists; } else {
} renewed
Err(e) => { .into_iter()
error!(e; "Failed to renew region leases for datanode: {datanode_id:?}, regions: {:?}", regions); .map(|(region_id, region_lease_info)| {
// If we throw error here, the datanode will be marked as failure by region failure handler. GrantedRegion::new(region_id, region_lease_info.role).into()
// So we only log the error and continue. })
} .collect::<Vec<_>>()
} };
acc.region_lease = Some(RegionLease {
regions: renewed,
duration_since_epoch: req.duration_since_epoch,
lease_seconds: self.region_lease_seconds,
closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(),
});
acc.inactive_region_ids = non_exists;
Ok(HandleControl::Continue) Ok(HandleControl::Continue)
} }

View File

@@ -375,14 +375,12 @@ pub struct MetasrvNodeInfo {
// The node total cpu millicores // The node total cpu millicores
#[serde(default)] #[serde(default)]
pub total_cpu_millicores: i64, pub total_cpu_millicores: i64,
// The node total memory bytes
#[serde(default)] #[serde(default)]
// The node total memory bytes
pub total_memory_bytes: i64, pub total_memory_bytes: i64,
/// The node build cpu usage millicores /// The node build cpu usage millicores
#[serde(default)]
pub cpu_usage_millicores: i64, pub cpu_usage_millicores: i64,
/// The node build memory usage bytes /// The node build memory usage bytes
#[serde(default)]
pub memory_usage_bytes: i64, pub memory_usage_bytes: i64,
// The node hostname // The node hostname
#[serde(default)] #[serde(default)]
@@ -860,18 +858,3 @@ impl Metasrv {
} }
} }
} }
#[cfg(test)]
mod tests {
use crate::metasrv::MetasrvNodeInfo;
#[test]
fn test_deserialize_metasrv_node_info() {
let str = r#"{"addr":"127.0.0.1:4002","version":"0.1.0","git_commit":"1234567890","start_time_ms":1715145600}"#;
let node_info: MetasrvNodeInfo = serde_json::from_str(str).unwrap();
assert_eq!(node_info.addr, "127.0.0.1:4002");
assert_eq!(node_info.version, "0.1.0");
assert_eq!(node_info.git_commit, "1234567890");
assert_eq!(node_info.start_time_ms, 1715145600);
}
}
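A minimal sketch of why the removed test passes even though the JSON omits the newer fields: `#[serde(default)]` backfills them with zero values, so heartbeats from older nodes still deserialize. This assumes `serde` (with derive) and `serde_json` are available; the struct below is a trimmed stand-in for `MetasrvNodeInfo`.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct NodeInfoSketch {
    addr: String,
    version: String,
    git_commit: String,
    start_time_ms: u64,
    // Missing in older payloads; defaulted to 0 instead of failing deserialization.
    #[serde(default)]
    total_cpu_millicores: i64,
    #[serde(default)]
    total_memory_bytes: i64,
}

fn main() {
    let s = r#"{"addr":"127.0.0.1:4002","version":"0.1.0","git_commit":"1234567890","start_time_ms":1715145600}"#;
    let info: NodeInfoSketch = serde_json::from_str(s).unwrap();
    assert_eq!(info.addr, "127.0.0.1:4002");
    assert_eq!(info.total_cpu_millicores, 0);
    assert_eq!(info.total_memory_bytes, 0);
    println!("{info:?}");
}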

View File

@@ -373,8 +373,7 @@ impl MetasrvBuilder {
runtime_switch_manager.clone(), runtime_switch_manager.clone(),
meta_peer_client.clone(), meta_peer_client.clone(),
leader_cached_kv_backend.clone(), leader_cached_kv_backend.clone(),
) );
.with_state(state.clone());
Some(RegionFailureHandler::new( Some(RegionFailureHandler::new(
region_supervisor, region_supervisor,

View File

@@ -134,7 +134,7 @@ pub async fn mock(
.timeout(Duration::from_secs(10)) .timeout(Duration::from_secs(10))
.connect_timeout(Duration::from_secs(10)) .connect_timeout(Duration::from_secs(10))
.tcp_nodelay(true); .tcp_nodelay(true);
let channel_manager = ChannelManager::with_config(config, None); let channel_manager = ChannelManager::with_config(config);
// Move client to an option so we can _move_ the inner value // Move client to an option so we can _move_ the inner value
// on the first attempt to connect. All other attempts will fail. // on the first attempt to connect. All other attempts will fail.

View File

@@ -41,7 +41,7 @@ use common_meta::key::table_route::TableRouteValue;
use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey}; use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey};
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef}; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef};
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock}; use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock};
use common_meta::peer::Peer; use common_meta::peer::Peer;
use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard}; use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard};
use common_procedure::error::{ use common_procedure::error::{
@@ -231,6 +231,8 @@ pub struct VolatileContext {
/// the corresponding [RegionRoute](common_meta::rpc::router::RegionRoute) of the opening region /// the corresponding [RegionRoute](common_meta::rpc::router::RegionRoute) of the opening region
/// was written into [TableRouteValue](common_meta::key::table_route::TableRouteValue). /// was written into [TableRouteValue](common_meta::key::table_route::TableRouteValue).
opening_region_guard: Option<OperatingRegionGuard>, opening_region_guard: Option<OperatingRegionGuard>,
/// `table_route` is stored via previous steps for future use.
table_route: Option<DeserializedValueWithBytes<TableRouteValue>>,
/// `datanode_table` is stored via previous steps for future use. /// `datanode_table` is stored via previous steps for future use.
from_peer_datanode_table: Option<DatanodeTableValue>, from_peer_datanode_table: Option<DatanodeTableValue>,
/// `table_info` is stored via previous steps for future use. /// `table_info` is stored via previous steps for future use.
@@ -397,23 +399,29 @@ impl Context {
/// Retry: /// Retry:
/// - Failed to retrieve the metadata of table. /// - Failed to retrieve the metadata of table.
pub async fn get_table_route_value( pub async fn get_table_route_value(
&self, &mut self,
) -> Result<DeserializedValueWithBytes<TableRouteValue>> { ) -> Result<&DeserializedValueWithBytes<TableRouteValue>> {
let table_id = self.persistent_ctx.region_id.table_id(); let table_route_value = &mut self.volatile_ctx.table_route;
let table_route = self
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableRoute: {table_id}"),
})?
.context(error::TableRouteNotFoundSnafu { table_id })?;
Ok(table_route) if table_route_value.is_none() {
let table_id = self.persistent_ctx.region_id.table_id();
let table_route = self
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableRoute: {table_id}"),
})?
.context(error::TableRouteNotFoundSnafu { table_id })?;
*table_route_value = Some(table_route);
}
Ok(table_route_value.as_ref().unwrap())
} }
/// Notifies the RegionSupervisor to register failure detectors of failed region. /// Notifies the RegionSupervisor to register failure detectors of failed region.
@@ -455,6 +463,12 @@ impl Context {
.await; .await;
} }
/// Removes the `table_route` of [VolatileContext], returns true if any.
pub fn remove_table_route_value(&mut self) -> bool {
let value = self.volatile_ctx.table_route.take();
value.is_some()
}
/// Returns the `table_info` of [VolatileContext] if any. /// Returns the `table_info` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote. /// Otherwise, returns the value retrieved from remote.
/// ///
@@ -649,13 +663,14 @@ impl RegionMigrationProcedure {
}) })
} }
async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> { async fn rollback_inner(&mut self) -> Result<()> {
let _timer = METRIC_META_REGION_MIGRATION_EXECUTE let _timer = METRIC_META_REGION_MIGRATION_EXECUTE
.with_label_values(&["rollback"]) .with_label_values(&["rollback"])
.start_timer(); .start_timer();
let table_id = self.context.region_id().table_id(); let table_id = self.context.region_id().table_id();
let region_id = self.context.region_id(); let region_id = self.context.region_id();
self.context.remove_table_route_value();
let table_metadata_manager = self.context.table_metadata_manager.clone(); let table_metadata_manager = self.context.table_metadata_manager.clone();
let table_route = self.context.get_table_route_value().await?; let table_route = self.context.get_table_route_value().await?;
@@ -668,11 +683,9 @@ impl RegionMigrationProcedure {
.any(|route| route.is_leader_downgrading()); .any(|route| route.is_leader_downgrading());
if downgraded { if downgraded {
let table_lock = TableLock::Write(region_id.table_id()).into();
let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await;
info!("Rolling back downgraded region leader table route, region: {region_id}"); info!("Rolling back downgraded region leader table route, region: {region_id}");
table_metadata_manager table_metadata_manager
.update_leader_region_status(table_id, &table_route, |route| { .update_leader_region_status(table_id, table_route, |route| {
if route.region.id == region_id { if route.region.id == region_id {
Some(None) Some(None)
} else { } else {
@@ -685,9 +698,6 @@ impl RegionMigrationProcedure {
.with_context(|_| error::RetryLaterWithSourceSnafu { .with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"),
})?; })?;
self.context
.deregister_failure_detectors_for_candidate_region()
.await;
} }
self.context.register_failure_detectors().await; self.context.register_failure_detectors().await;
@@ -702,8 +712,8 @@ impl Procedure for RegionMigrationProcedure {
Self::TYPE_NAME Self::TYPE_NAME
} }
async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> { async fn rollback(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<()> {
self.rollback_inner(ctx) self.rollback_inner()
.await .await
.map_err(ProcedureError::external) .map_err(ProcedureError::external)
} }
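The hunks above trade a fetch-on-every-call getter for a cached value in the volatile context plus an explicit invalidation hook (`get_table_route_value` / `remove_table_route_value`). A minimal sketch of that shape, with a plain `String` standing in for the deserialized table route:

struct VolatileCtx {
    table_route: Option<String>,
}

impl VolatileCtx {
    // Fetch once, then reuse the cached value on later calls.
    fn get_table_route(&mut self, fetch: impl FnOnce() -> String) -> &String {
        if self.table_route.is_none() {
            self.table_route = Some(fetch());
        }
        self.table_route.as_ref().unwrap()
    }

    // Drop the cache; returns true if there was a cached value.
    fn remove_table_route(&mut self) -> bool {
        self.table_route.take().is_some()
    }
}

fn main() {
    let mut ctx = VolatileCtx { table_route: None };
    assert_eq!(ctx.get_table_route(|| "route-v0".to_string()), "route-v0");
    // A later call reuses the cached value; the closure is not invoked again.
    assert_eq!(ctx.get_table_route(|| unreachable!()), "route-v0");
    // After a failed metadata update, drop the cache so the retry refetches fresh state.
    assert!(ctx.remove_table_route());
}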

View File

@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
use std::any::Any; use std::any::Any;
use std::ops::Div;
use std::time::Duration; use std::time::Duration;
use api::v1::meta::MailboxMessage; use api::v1::meta::MailboxMessage;
@@ -140,15 +139,12 @@ impl OpenCandidateRegion {
input: open_instruction.to_string(), input: open_instruction.to_string(),
})?; })?;
let operation_timeout =
ctx.next_operation_timeout()
.context(error::ExceededDeadlineSnafu {
operation: "Open candidate region",
})?;
let operation_timeout = operation_timeout.div(2).max(OPEN_CANDIDATE_REGION_TIMEOUT);
let ch = Channel::Datanode(candidate.id); let ch = Channel::Datanode(candidate.id);
let now = Instant::now(); let now = Instant::now();
let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?; let receiver = ctx
.mailbox
.send(&ch, msg, OPEN_CANDIDATE_REGION_TIMEOUT)
.await?;
match receiver.await { match receiver.await {
Ok(msg) => { Ok(msg) => {
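One side of the hunk above derives the mailbox timeout from the remaining procedure deadline instead of a fixed constant: half of what is left, floored at the per-request minimum `OPEN_CANDIDATE_REGION_TIMEOUT`. A tiny sketch of that arithmetic; the 30-second floor is an assumed value, not the real constant.

use std::time::Duration;

const OPEN_CANDIDATE_REGION_TIMEOUT: Duration = Duration::from_secs(30); // assumed floor

fn open_region_timeout(remaining: Duration) -> Duration {
    // Half of the remaining deadline, but never shorter than the per-request floor.
    (remaining / 2).max(OPEN_CANDIDATE_REGION_TIMEOUT)
}

fn main() {
    assert_eq!(open_region_timeout(Duration::from_secs(120)), Duration::from_secs(60));
    assert_eq!(open_region_timeout(Duration::from_secs(10)), Duration::from_secs(30));
}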

View File

@@ -46,7 +46,7 @@ impl UpdateMetadata {
// TODO(weny): ensures the leader region peer is the `from_peer`. // TODO(weny): ensures the leader region peer is the `from_peer`.
if let Err(err) = table_metadata_manager if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, &current_table_route_value, |route| { .update_leader_region_status(table_id, current_table_route_value, |route| {
if route.region.id == region_id if route.region.id == region_id
&& route && route
.leader_peer .leader_peer
@@ -61,6 +61,7 @@ impl UpdateMetadata {
.await .await
.context(error::TableMetadataManagerSnafu) .context(error::TableMetadataManagerSnafu)
{ {
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!( reason: format!(
"Failed to update the table route during the downgrading leader region, region_id: {region_id}, from_peer_id: {from_peer_id}" "Failed to update the table route during the downgrading leader region, region_id: {region_id}, from_peer_id: {from_peer_id}"
@@ -68,6 +69,8 @@ impl UpdateMetadata {
}); });
} }
ctx.remove_table_route_value();
Ok(()) Ok(())
} }
} }
@@ -78,7 +81,7 @@ mod tests {
use common_meta::key::test_utils::new_test_table_info; use common_meta::key::test_utils::new_test_table_info;
use common_meta::peer::Peer; use common_meta::peer::Peer;
use common_meta::rpc::router::{Region, RegionRoute}; use common_meta::rpc::router::{LeaderState, Region, RegionRoute};
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::error::Error; use crate::error::Error;
@@ -112,6 +115,63 @@ mod tests {
assert!(!err.is_retryable()); assert!(!err.is_retryable());
} }
#[tokio::test]
async fn test_failed_to_update_table_route_error() {
let state = UpdateMetadata::Downgrade;
let persistent_context = new_persistent_context();
let from_peer = persistent_context.from_peer.clone();
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_id = ctx.region_id().table_id();
let table_info = new_test_table_info(1024, vec![1, 2]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(1024, 1)),
leader_peer: Some(from_peer.clone()),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 2)),
leader_peer: Some(Peer::empty(4)),
..Default::default()
},
];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let original_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &original_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
Some(Some(LeaderState::Downgrading))
} else {
None
}
})
.await
.unwrap();
// sets the old table route.
ctx.volatile_ctx.table_route = Some(original_table_route);
let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
}
#[tokio::test] #[tokio::test]
async fn test_only_downgrade_from_peer() { async fn test_only_downgrade_from_peer() {
let mut state = Box::new(UpdateMetadata::Downgrade); let mut state = Box::new(UpdateMetadata::Downgrade);
@@ -152,6 +212,7 @@ mod tests {
// It should remain unchanged. // It should remain unchanged.
assert_eq!(latest_table_route.version().unwrap(), 0); assert_eq!(latest_table_route.version().unwrap(), 0);
assert!(!latest_table_route.region_routes().unwrap()[0].is_leader_downgrading()); assert!(!latest_table_route.region_routes().unwrap()[0].is_leader_downgrading());
assert!(ctx.volatile_ctx.table_route.is_none());
} }
#[tokio::test] #[tokio::test]
@@ -193,5 +254,6 @@ mod tests {
.unwrap(); .unwrap();
assert!(latest_table_route.region_routes().unwrap()[0].is_leader_downgrading()); assert!(latest_table_route.region_routes().unwrap()[0].is_leader_downgrading());
assert!(ctx.volatile_ctx.table_route.is_none());
} }
} }

View File

@@ -35,7 +35,7 @@ impl UpdateMetadata {
let current_table_route_value = ctx.get_table_route_value().await?; let current_table_route_value = ctx.get_table_route_value().await?;
if let Err(err) = table_metadata_manager if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, &current_table_route_value, |route| { .update_leader_region_status(table_id, current_table_route_value, |route| {
if route.region.id == region_id { if route.region.id == region_id {
Some(None) Some(None)
} else { } else {
@@ -45,12 +45,14 @@ impl UpdateMetadata {
.await .await
.context(error::TableMetadataManagerSnafu) .context(error::TableMetadataManagerSnafu)
{ {
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"),
}); });
} }
ctx.register_failure_detectors().await; ctx.register_failure_detectors().await;
ctx.remove_table_route_value();
Ok(()) Ok(())
} }
@@ -59,6 +61,7 @@ impl UpdateMetadata {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::assert_matches::assert_matches; use std::assert_matches::assert_matches;
use std::sync::Arc;
use common_meta::key::test_utils::new_test_table_info; use common_meta::key::test_utils::new_test_table_info;
use common_meta::peer::Peer; use common_meta::peer::Peer;
@@ -70,6 +73,7 @@ mod tests {
use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context}; use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context};
use crate::procedure::region_migration::update_metadata::UpdateMetadata; use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{ContextFactory, PersistentContext, State}; use crate::procedure::region_migration::{ContextFactory, PersistentContext, State};
use crate::region::supervisor::RegionFailureDetectorControl;
fn new_persistent_context() -> PersistentContext { fn new_persistent_context() -> PersistentContext {
test_util::new_persistent_context(1, 2, RegionId::new(1024, 1)) test_util::new_persistent_context(1, 2, RegionId::new(1024, 1))
@@ -89,6 +93,101 @@ mod tests {
assert!(!err.is_retryable()); assert!(!err.is_retryable());
} }
#[tokio::test]
async fn test_update_table_route_with_retry() {
let state = UpdateMetadata::Rollback;
let persistent_context = new_persistent_context();
let from_peer = persistent_context.from_peer.clone();
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let (tx, mut rx) = tokio::sync::mpsc::channel(8);
ctx.region_failure_detector_controller = Arc::new(RegionFailureDetectorControl::new(tx));
let table_id = ctx.region_id().table_id();
let table_info = new_test_table_info(1024, vec![1, 2, 3]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(1024, 1)),
leader_peer: Some(from_peer.clone()),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 2)),
leader_peer: Some(Peer::empty(4)),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 3)),
leader_peer: Some(Peer::empty(5)),
..Default::default()
},
];
let expected_region_routes = {
let mut region_routes = region_routes.clone();
region_routes[0].leader_state = None;
region_routes[1].leader_state = None;
region_routes
};
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let old_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &old_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
Some(None)
} else {
None
}
})
.await
.unwrap();
ctx.volatile_ctx.table_route = Some(old_table_route);
let err = state
.rollback_downgraded_region(&mut ctx)
.await
.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
assert_eq!(rx.len(), 0);
state.rollback_downgraded_region(&mut ctx).await.unwrap();
let event = rx.try_recv().unwrap();
let detecting_regions = event.into_region_failure_detectors();
assert_eq!(
detecting_regions,
vec![(from_peer.id, ctx.persistent_ctx.region_id)]
);
let table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get(table_id)
.await
.unwrap()
.unwrap();
assert_eq!(
&expected_region_routes,
table_route.region_routes().unwrap()
);
}
#[tokio::test] #[tokio::test]
async fn test_next_migration_end_state() { async fn test_next_migration_end_state() {
let mut state = Box::new(UpdateMetadata::Rollback); let mut state = Box::new(UpdateMetadata::Rollback);
@@ -139,6 +238,8 @@ mod tests {
.downcast_ref::<RegionMigrationAbort>() .downcast_ref::<RegionMigrationAbort>()
.unwrap(); .unwrap();
assert!(ctx.volatile_ctx.table_route.is_none());
let table_route = table_metadata_manager let table_route = table_metadata_manager
.table_route_manager() .table_route_manager()
.table_route_storage() .table_route_storage()

View File

@@ -166,7 +166,7 @@ impl UpdateMetadata {
region_options: region_options.clone(), region_options: region_options.clone(),
region_wal_options: region_wal_options.clone(), region_wal_options: region_wal_options.clone(),
}, },
&table_route_value, table_route_value,
region_routes, region_routes,
&region_options, &region_options,
&region_wal_options, &region_wal_options,
@@ -174,11 +174,13 @@ impl UpdateMetadata {
.await .await
.context(error::TableMetadataManagerSnafu) .context(error::TableMetadataManagerSnafu)
{ {
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}"), reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}"),
}); });
}; };
ctx.remove_table_route_value();
ctx.deregister_failure_detectors().await; ctx.deregister_failure_detectors().await;
// Consumes the guard. // Consumes the guard.
ctx.volatile_ctx.opening_region_guard.take(); ctx.volatile_ctx.opening_region_guard.take();
@@ -308,6 +310,71 @@ mod tests {
assert_eq!(new_region_routes[0].leader_peer.as_ref().unwrap().id, 2); assert_eq!(new_region_routes[0].leader_peer.as_ref().unwrap().id, 2);
} }
#[tokio::test]
async fn test_failed_to_update_table_route_error() {
let state = UpdateMetadata::Upgrade;
let env = TestingEnv::new();
let persistent_context = new_persistent_context();
let mut ctx = env.context_factory().new_context(persistent_context);
let opening_keeper = MemoryRegionKeeper::default();
let table_id = 1024;
let table_info = new_test_table_info(table_id, vec![1]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5), Peer::empty(3)],
leader_state: Some(LeaderState::Downgrading),
leader_down_since: Some(current_time_millis()),
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(4)),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let original_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &original_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
// Removes the status.
Some(None)
} else {
None
}
})
.await
.unwrap();
// sets the old table route.
ctx.volatile_ctx.table_route = Some(original_table_route);
let guard = opening_keeper
.register(2, RegionId::new(table_id, 1))
.unwrap();
ctx.volatile_ctx.opening_region_guard = Some(guard);
let err = state.upgrade_candidate_region(&mut ctx).await.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(ctx.volatile_ctx.opening_region_guard.is_some());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
}
#[tokio::test] #[tokio::test]
async fn test_check_metadata() { async fn test_check_metadata() {
let state = UpdateMetadata::Upgrade; let state = UpdateMetadata::Upgrade;
@@ -425,6 +492,7 @@ mod tests {
.unwrap(); .unwrap();
let region_routes = table_route.region_routes().unwrap(); let region_routes = table_route.region_routes().unwrap();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(ctx.volatile_ctx.opening_region_guard.is_none()); assert!(ctx.volatile_ctx.opening_region_guard.is_none());
assert_eq!(region_routes.len(), 1); assert_eq!(region_routes.len(), 1);
assert!(!region_routes[0].is_leader_downgrading()); assert!(!region_routes[0].is_leader_downgrading());

View File

@@ -17,9 +17,7 @@ use std::time::Duration;
use api::v1::meta::MailboxMessage; use api::v1::meta::MailboxMessage;
use common_meta::ddl::utils::parse_region_wal_options; use common_meta::ddl::utils::parse_region_wal_options;
use common_meta::instruction::{ use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply};
Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply,
};
use common_meta::lock_key::RemoteWalLock; use common_meta::lock_key::RemoteWalLock;
use common_meta::wal_options_allocator::extract_topic_from_wal_options; use common_meta::wal_options_allocator::extract_topic_from_wal_options;
use common_procedure::{Context as ProcedureContext, Status}; use common_procedure::{Context as ProcedureContext, Status};
@@ -133,19 +131,19 @@ impl UpgradeCandidateRegion {
None None
}; };
let upgrade_instruction = Instruction::UpgradeRegions(vec![ let upgrade_instruction = Instruction::UpgradeRegion(
UpgradeRegion { UpgradeRegion {
region_id, region_id,
last_entry_id, last_entry_id,
metadata_last_entry_id, metadata_last_entry_id,
replay_timeout, replay_timeout: Some(replay_timeout),
location_id: Some(ctx.persistent_ctx.from_peer.id), location_id: Some(ctx.persistent_ctx.from_peer.id),
replay_entry_id: None, replay_entry_id: None,
metadata_replay_entry_id: None, metadata_replay_entry_id: None,
} }
.with_replay_entry_id(checkpoint.map(|c| c.entry_id)) .with_replay_entry_id(checkpoint.map(|c| c.entry_id))
.with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)), .with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)),
]); );
Ok(upgrade_instruction) Ok(upgrade_instruction)
} }
@@ -195,7 +193,11 @@ impl UpgradeCandidateRegion {
match receiver.await { match receiver.await {
Ok(msg) => { Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?; let reply = HeartbeatMailbox::json_reply(&msg)?;
let InstructionReply::UpgradeRegions(UpgradeRegionsReply { replies }) = reply let InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready,
exists,
error,
}) = reply
else { else {
return error::UnexpectedInstructionReplySnafu { return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(), mailbox_message: msg.to_string(),
@@ -203,13 +205,6 @@ impl UpgradeCandidateRegion {
} }
.fail(); .fail();
}; };
// TODO(weny): handle multiple replies.
let UpgradeRegionReply {
ready,
exists,
error,
..
} = &replies[0];
// Notes: The order of handling is important. // Notes: The order of handling is important.
if error.is_some() { if error.is_some() {

View File

@@ -18,7 +18,7 @@ use api::v1::meta::mailbox_message::Payload;
use api::v1::meta::{HeartbeatResponse, MailboxMessage}; use api::v1::meta::{HeartbeatResponse, MailboxMessage};
use common_meta::instruction::{ use common_meta::instruction::{
DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply, DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply,
UpgradeRegionReply, UpgradeRegionsReply, UpgradeRegionReply,
}; };
use common_meta::key::TableMetadataManagerRef; use common_meta::key::TableMetadataManagerRef;
use common_meta::key::table_route::TableRouteValue; use common_meta::key::table_route::TableRouteValue;
@@ -212,14 +212,11 @@ pub fn new_upgrade_region_reply(
to: "meta".to_string(), to: "meta".to_string(),
timestamp_millis: current_time_millis(), timestamp_millis: current_time_millis(),
payload: Some(Payload::Json( payload: Some(Payload::Json(
serde_json::to_string(&InstructionReply::UpgradeRegions( serde_json::to_string(&InstructionReply::UpgradeRegion(UpgradeRegionReply {
UpgradeRegionsReply::single(UpgradeRegionReply { ready,
region_id: RegionId::new(0, 0), exists,
ready, error,
exists, }))
error,
}),
))
.unwrap(), .unwrap(),
)), )),
} }
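A minimal sketch of what `new_upgrade_region_reply` builds above: the mailbox payload is simply a JSON-serialized reply enum that the receiving side parses back out. The enum shape and field set here are reduced for illustration and do not claim to match the actual wire format.

use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
enum ReplySketch {
    UpgradeRegion {
        ready: bool,
        exists: bool,
        error: Option<String>,
    },
}

fn main() {
    let payload = serde_json::to_string(&ReplySketch::UpgradeRegion {
        ready: true,
        exists: true,
        error: None,
    })
    .unwrap();
    // The receiver deserializes the same enum out of the mailbox message.
    let parsed: ReplySketch = serde_json::from_str(&payload).unwrap();
    assert_eq!(
        parsed,
        ReplySketch::UpgradeRegion { ready: true, exists: true, error: None }
    );
}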

View File

@@ -52,7 +52,6 @@ use crate::procedure::region_migration::{
}; };
use crate::region::failure_detector::RegionFailureDetector; use crate::region::failure_detector::RegionFailureDetector;
use crate::selector::SelectorOptions; use crate::selector::SelectorOptions;
use crate::state::StateRef;
/// `DatanodeHeartbeat` represents the heartbeat signal sent from a datanode. /// `DatanodeHeartbeat` represents the heartbeat signal sent from a datanode.
/// It includes identifiers for the cluster and datanode, a list of regions being monitored, /// It includes identifiers for the cluster and datanode, a list of regions being monitored,
@@ -101,6 +100,16 @@ pub(crate) enum Event {
Dump(tokio::sync::oneshot::Sender<RegionFailureDetector>), Dump(tokio::sync::oneshot::Sender<RegionFailureDetector>),
} }
#[cfg(test)]
impl Event {
pub(crate) fn into_region_failure_detectors(self) -> Vec<DetectingRegion> {
match self {
Self::RegisterFailureDetectors(detecting_regions) => detecting_regions,
_ => unreachable!(),
}
}
}
impl Debug for Event { impl Debug for Event {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
@@ -130,9 +139,6 @@ pub struct RegionSupervisorTicker {
/// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`]. /// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`].
tick_handle: Mutex<Option<JoinHandle<()>>>, tick_handle: Mutex<Option<JoinHandle<()>>>,
/// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`].
initialization_handler: Mutex<Option<JoinHandle<()>>>,
/// The interval of tick. /// The interval of tick.
tick_interval: Duration, tick_interval: Duration,
@@ -176,7 +182,6 @@ impl RegionSupervisorTicker {
); );
Self { Self {
tick_handle: Mutex::new(None), tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval, tick_interval,
initialization_delay, initialization_delay,
initialization_retry_period, initialization_retry_period,
@@ -197,7 +202,7 @@ impl RegionSupervisorTicker {
self.initialization_retry_period, self.initialization_retry_period,
); );
initialization_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); initialization_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
let initialization_handler = common_runtime::spawn_global(async move { common_runtime::spawn_global(async move {
loop { loop {
initialization_interval.tick().await; initialization_interval.tick().await;
let (tx, rx) = oneshot::channel(); let (tx, rx) = oneshot::channel();
@@ -213,7 +218,6 @@ impl RegionSupervisorTicker {
} }
} }
}); });
*self.initialization_handler.lock().unwrap() = Some(initialization_handler);
let sender = self.sender.clone(); let sender = self.sender.clone();
let ticker_loop = tokio::spawn(async move { let ticker_loop = tokio::spawn(async move {
@@ -243,11 +247,6 @@ impl RegionSupervisorTicker {
handle.abort(); handle.abort();
info!("The tick loop is stopped."); info!("The tick loop is stopped.");
} }
let initialization_handler = self.initialization_handler.lock().unwrap().take();
if let Some(initialization_handler) = initialization_handler {
initialization_handler.abort();
info!("The initialization loop is stopped.");
}
} }
} }
@@ -291,8 +290,6 @@ pub struct RegionSupervisor {
peer_resolver: PeerResolverRef, peer_resolver: PeerResolverRef,
/// The kv backend. /// The kv backend.
kv_backend: KvBackendRef, kv_backend: KvBackendRef,
/// The meta state, used to check if the current metasrv is the leader.
state: Option<StateRef>,
} }
/// Controller for managing failure detectors for regions. /// Controller for managing failure detectors for regions.
@@ -376,29 +373,12 @@ impl RegionSupervisor {
runtime_switch_manager, runtime_switch_manager,
peer_resolver, peer_resolver,
kv_backend, kv_backend,
state: None,
} }
} }
/// Sets the meta state.
pub(crate) fn with_state(mut self, state: StateRef) -> Self {
self.state = Some(state);
self
}
/// Runs the main loop. /// Runs the main loop.
pub(crate) async fn run(&mut self) { pub(crate) async fn run(&mut self) {
while let Some(event) = self.receiver.recv().await { while let Some(event) = self.receiver.recv().await {
if let Some(state) = self.state.as_ref()
&& !state.read().unwrap().is_leader()
{
warn!(
"The current metasrv is not the leader, ignore {:?} event",
event
);
continue;
}
match event { match event {
Event::InitializeAllRegions(sender) => { Event::InitializeAllRegions(sender) => {
match self.is_maintenance_mode_enabled().await { match self.is_maintenance_mode_enabled().await {
@@ -433,10 +413,7 @@ impl RegionSupervisor {
self.deregister_failure_detectors(detecting_regions).await self.deregister_failure_detectors(detecting_regions).await
} }
Event::HeartbeatArrived(heartbeat) => self.on_heartbeat_arrived(heartbeat), Event::HeartbeatArrived(heartbeat) => self.on_heartbeat_arrived(heartbeat),
Event::Clear => { Event::Clear => self.clear(),
self.clear();
info!("Region supervisor is initialized.");
}
#[cfg(test)] #[cfg(test)]
Event::Dump(sender) => { Event::Dump(sender) => {
let _ = sender.send(self.failure_detector.dump()); let _ = sender.send(self.failure_detector.dump());
@@ -929,7 +906,6 @@ pub(crate) mod tests {
let (tx, mut rx) = tokio::sync::mpsc::channel(128); let (tx, mut rx) = tokio::sync::mpsc::channel(128);
let ticker = RegionSupervisorTicker { let ticker = RegionSupervisorTicker {
tick_handle: Mutex::new(None), tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval: Duration::from_millis(10), tick_interval: Duration::from_millis(10),
initialization_delay: Duration::from_millis(100), initialization_delay: Duration::from_millis(100),
initialization_retry_period: Duration::from_millis(100), initialization_retry_period: Duration::from_millis(100),
@@ -956,7 +932,6 @@ pub(crate) mod tests {
let (tx, mut rx) = tokio::sync::mpsc::channel(128); let (tx, mut rx) = tokio::sync::mpsc::channel(128);
let ticker = RegionSupervisorTicker { let ticker = RegionSupervisorTicker {
tick_handle: Mutex::new(None), tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval: Duration::from_millis(1000), tick_interval: Duration::from_millis(1000),
initialization_delay: Duration::from_millis(50), initialization_delay: Duration::from_millis(50),
initialization_retry_period: Duration::from_millis(50), initialization_retry_period: Duration::from_millis(50),
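For reference, a simplified sketch of the `RegionSupervisorTicker` shape above: a spawned loop driven by `tokio::time::interval` with `MissedTickBehavior::Skip`, pushing events into an mpsc channel, with the `JoinHandle` kept so the loop can be aborted on shutdown. Assumes a `tokio` runtime; the `Event` type is a stand-in.

use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::MissedTickBehavior;

#[derive(Debug)]
enum Event {
    Tick,
}

fn start_ticker(tick_interval: Duration, sender: mpsc::Sender<Event>) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(tick_interval);
        // Skip missed ticks instead of bursting to catch up.
        interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
        loop {
            interval.tick().await;
            if sender.send(Event::Tick).await.is_err() {
                break; // receiver dropped, stop ticking
            }
        }
    })
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(8);
    let handle = start_ticker(Duration::from_millis(10), tx);
    assert!(matches!(rx.recv().await, Some(Event::Tick)));
    // Aborting the handle stops the loop, mirroring `stop()` above.
    handle.abort();
}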

View File

@@ -79,7 +79,6 @@ impl heartbeat_server::Heartbeat for Metasrv {
let res = handler_group let res = handler_group
.handle(req, ctx.clone()) .handle(req, ctx.clone())
.await .await
.inspect_err(|e| warn!(e; "Failed to handle heartbeat request, pusher: {pusher_id:?}", ))
.map_err(|e| e.into()); .map_err(|e| e.into());
is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader()); is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader());

View File

@@ -75,12 +75,6 @@ impl State {
}) })
} }
/// Returns true if the current state is a leader.
pub fn is_leader(&self) -> bool {
matches!(self, State::Leader(_))
}
/// Returns true if the leader cache is enabled.
pub fn enable_leader_cache(&self) -> bool { pub fn enable_leader_cache(&self) -> bool {
match &self { match &self {
State::Leader(leader) => leader.enable_leader_cache, State::Leader(leader) => leader.enable_leader_cache,

View File

@@ -46,7 +46,6 @@ tracing.workspace = true
common-meta = { workspace = true, features = ["testing"] } common-meta = { workspace = true, features = ["testing"] }
common-test-util.workspace = true common-test-util.workspace = true
mito2 = { workspace = true, features = ["test"] } mito2 = { workspace = true, features = ["test"] }
common-wal = { workspace = true }
[package.metadata.cargo-udeps.ignore] [package.metadata.cargo-udeps.ignore]
normal = ["aquamarine"] normal = ["aquamarine"]

View File

@@ -23,8 +23,8 @@ pub(crate) const DEFAULT_FLUSH_METADATA_REGION_INTERVAL: Duration = Duration::fr
/// Configuration for the metric engine. /// Configuration for the metric engine.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EngineConfig { pub struct EngineConfig {
/// Whether to use sparse primary key encoding. /// Experimental feature to use sparse primary key encoding.
pub sparse_primary_key_encoding: bool, pub experimental_sparse_primary_key_encoding: bool,
/// The flush interval of the metadata region. /// The flush interval of the metadata region.
#[serde( #[serde(
with = "humantime_serde", with = "humantime_serde",
@@ -37,7 +37,7 @@ impl Default for EngineConfig {
fn default() -> Self { fn default() -> Self {
Self { Self {
flush_metadata_region_interval: DEFAULT_FLUSH_METADATA_REGION_INTERVAL, flush_metadata_region_interval: DEFAULT_FLUSH_METADATA_REGION_INTERVAL,
sparse_primary_key_encoding: true, experimental_sparse_primary_key_encoding: false,
} }
} }
} }
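A sketch of the config shape above: the flush interval round-trips through a human-readable string via `humantime_serde`, and both fields fall back to defaults when absent. This assumes the `serde`, `humantime_serde`, and `toml` crates; the 30-second default is an assumed value, not the real `DEFAULT_FLUSH_METADATA_REGION_INTERVAL`.

use std::time::Duration;
use serde::Deserialize;

#[derive(Debug, Deserialize, PartialEq, Eq)]
struct EngineConfigSketch {
    // Parsed from strings like "30s" or "5m".
    #[serde(with = "humantime_serde", default = "default_flush_interval")]
    flush_metadata_region_interval: Duration,
    #[serde(default)]
    experimental_sparse_primary_key_encoding: bool,
}

fn default_flush_interval() -> Duration {
    Duration::from_secs(30) // assumed default
}

fn main() {
    let cfg: EngineConfigSketch =
        toml::from_str(r#"flush_metadata_region_interval = "30s""#).unwrap();
    assert_eq!(cfg.flush_metadata_region_interval, Duration::from_secs(30));
    assert!(!cfg.experimental_sparse_primary_key_encoding);
}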

View File

@@ -20,7 +20,7 @@ use snafu::ResultExt;
use store_api::metadata::ColumnMetadata; use store_api::metadata::ColumnMetadata;
use store_api::region_engine::RegionEngine; use store_api::region_engine::RegionEngine;
use store_api::region_request::{ use store_api::region_request::{
AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionRequest, AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionPutRequest, RegionRequest,
}; };
use store_api::storage::consts::ReservedColumnId; use store_api::storage::consts::ReservedColumnId;
use store_api::storage::{ConcreteDataType, RegionId}; use store_api::storage::{ConcreteDataType, RegionId};
@@ -183,11 +183,11 @@ impl DataRegion {
pub async fn write_data( pub async fn write_data(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: RegionRequest, request: RegionPutRequest,
) -> Result<AffectedRows> { ) -> Result<AffectedRows> {
let region_id = utils::to_data_region_id(region_id); let region_id = utils::to_data_region_id(region_id);
self.mito self.mito
.handle_request(region_id, request) .handle_request(region_id, RegionRequest::Put(request))
.await .await
.context(MitoWriteOperationSnafu) .context(MitoWriteOperationSnafu)
.map(|result| result.affected_rows) .map(|result| result.affected_rows)

View File

@@ -37,7 +37,7 @@ use common_error::status_code::StatusCode;
use common_runtime::RepeatedTask; use common_runtime::RepeatedTask;
use mito2::engine::MitoEngine; use mito2::engine::MitoEngine;
pub(crate) use options::IndexOptions; pub(crate) use options::IndexOptions;
use snafu::{OptionExt, ResultExt}; use snafu::ResultExt;
pub(crate) use state::MetricEngineState; pub(crate) use state::MetricEngineState;
use store_api::metadata::RegionMetadataRef; use store_api::metadata::RegionMetadataRef;
use store_api::metric_engine_consts::METRIC_ENGINE_NAME; use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
@@ -46,9 +46,7 @@ use store_api::region_engine::{
RegionStatistic, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, RegionStatistic, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
SettableRegionRoleState, SyncManifestResponse, SettableRegionRoleState, SyncManifestResponse,
}; };
use store_api::region_request::{ use store_api::region_request::{BatchRegionDdlRequest, RegionOpenRequest, RegionRequest};
BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber}; use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
use crate::config::EngineConfig; use crate::config::EngineConfig;
@@ -144,17 +142,6 @@ impl RegionEngine for MetricEngine {
.map_err(BoxedError::new) .map_err(BoxedError::new)
} }
async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<BatchResponses, BoxedError> {
self.inner
.handle_batch_catchup_requests(parallelism, requests)
.await
.map_err(BoxedError::new)
}
async fn handle_batch_ddl_requests( async fn handle_batch_ddl_requests(
&self, &self,
batch_request: BatchRegionDdlRequest, batch_request: BatchRegionDdlRequest,
@@ -248,26 +235,19 @@ impl RegionEngine for MetricEngine {
} }
} }
RegionRequest::Truncate(_) => UnsupportedRegionRequestSnafu { request }.fail(), RegionRequest::Truncate(_) => UnsupportedRegionRequestSnafu { request }.fail(),
RegionRequest::Delete(delete) => self.inner.delete_region(region_id, delete).await, RegionRequest::Delete(_) => {
RegionRequest::Catchup(_) => { if self.inner.is_physical_region(region_id) {
let mut response = self self.inner
.inner .mito
.handle_batch_catchup_requests( .handle_request(region_id, request)
1, .await
vec![(region_id, RegionCatchupRequest::default())], .context(error::MitoDeleteOperationSnafu)
) .map(|response| response.affected_rows)
.await } else {
.map_err(BoxedError::new)?; UnsupportedRegionRequestSnafu { request }.fail()
debug_assert_eq!(response.len(), 1); }
let (resp_region_id, response) = response
.pop()
.context(error::UnexpectedRequestSnafu {
reason: "expected 1 response, but got zero responses",
})
.map_err(BoxedError::new)?;
debug_assert_eq!(region_id, resp_region_id);
return response;
} }
RegionRequest::Catchup(req) => self.inner.catchup_region(region_id, req).await,
RegionRequest::BulkInserts(_) => { RegionRequest::BulkInserts(_) => {
// todo(hl): find a way to support bulk inserts in metric engine. // todo(hl): find a way to support bulk inserts in metric engine.
UnsupportedRegionRequestSnafu { request }.fail() UnsupportedRegionRequestSnafu { request }.fail()
@@ -516,17 +496,13 @@ mod test {
use std::collections::HashMap; use std::collections::HashMap;
use common_telemetry::info; use common_telemetry::info;
use common_wal::options::{KafkaWalOptions, WalOptions};
use mito2::sst::location::region_dir_from_table_dir; use mito2::sst::location::region_dir_from_table_dir;
use mito2::test_util::{kafka_log_store_factory, prepare_test_for_kafka_log_store};
use store_api::metric_engine_consts::PHYSICAL_TABLE_METADATA_KEY; use store_api::metric_engine_consts::PHYSICAL_TABLE_METADATA_KEY;
use store_api::mito_engine_options::WAL_OPTIONS_KEY;
use store_api::region_request::{ use store_api::region_request::{
PathType, RegionCloseRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest, PathType, RegionCloseRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest,
}; };
use super::*; use super::*;
use crate::maybe_skip_kafka_log_store_integration_test;
use crate::test_util::TestEnv; use crate::test_util::TestEnv;
#[tokio::test] #[tokio::test]
@@ -707,128 +683,4 @@ mod test {
.unwrap_err(); .unwrap_err();
assert_eq!(err.status_code(), StatusCode::RegionNotFound); assert_eq!(err.status_code(), StatusCode::RegionNotFound);
} }
#[tokio::test]
async fn test_catchup_regions() {
common_telemetry::init_default_ut_logging();
maybe_skip_kafka_log_store_integration_test!();
let kafka_log_store_factory = kafka_log_store_factory().unwrap();
let mito_env = mito2::test_util::TestEnv::new()
.await
.with_log_store_factory(kafka_log_store_factory.clone());
let env = TestEnv::with_mito_env(mito_env).await;
let table_dir = |region_id| format!("table/{region_id}");
let mut physical_region_ids = vec![];
let mut logical_region_ids = vec![];
let num_topics = 3;
let num_physical_regions = 8;
let num_logical_regions = 16;
let parallelism = 2;
let mut topics = Vec::with_capacity(num_topics);
for _ in 0..num_topics {
let topic = prepare_test_for_kafka_log_store(&kafka_log_store_factory)
.await
.unwrap();
topics.push(topic);
}
let topic_idx = |id| (id as usize) % num_topics;
// Creates physical regions
for i in 0..num_physical_regions {
let physical_region_id = RegionId::new(1, i);
physical_region_ids.push(physical_region_id);
let wal_options = WalOptions::Kafka(KafkaWalOptions {
topic: topics[topic_idx(i)].clone(),
});
env.create_physical_region(
physical_region_id,
&table_dir(physical_region_id),
vec![(
WAL_OPTIONS_KEY.to_string(),
serde_json::to_string(&wal_options).unwrap(),
)],
)
.await;
// Creates logical regions for each physical region
for j in 0..num_logical_regions {
let logical_region_id = RegionId::new(1024 + i, j);
logical_region_ids.push(logical_region_id);
env.create_logical_region(physical_region_id, logical_region_id)
.await;
}
}
let metric_engine = env.metric();
// Closes all regions
for region_id in logical_region_ids.iter().chain(physical_region_ids.iter()) {
metric_engine
.handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {}))
.await
.unwrap();
}
// Opens all regions and skip the wal
let requests = physical_region_ids
.iter()
.enumerate()
.map(|(idx, region_id)| {
let mut options = HashMap::new();
let wal_options = WalOptions::Kafka(KafkaWalOptions {
topic: topics[topic_idx(idx as u32)].clone(),
});
options.insert(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new());
options.insert(
WAL_OPTIONS_KEY.to_string(),
serde_json::to_string(&wal_options).unwrap(),
);
(
*region_id,
RegionOpenRequest {
engine: METRIC_ENGINE_NAME.to_string(),
table_dir: table_dir(*region_id),
path_type: PathType::Bare,
options: options.clone(),
skip_wal_replay: true,
checkpoint: None,
},
)
})
.collect::<Vec<_>>();
info!("Open batch regions with parallelism: {parallelism}");
metric_engine
.handle_batch_open_requests(parallelism, requests)
.await
.unwrap();
{
let state = metric_engine.inner.state.read().unwrap();
for logical_region in &logical_region_ids {
assert!(!state.logical_regions().contains_key(logical_region));
}
}
let catch_requests = physical_region_ids
.iter()
.map(|region_id| {
(
*region_id,
RegionCatchupRequest {
set_writable: true,
..Default::default()
},
)
})
.collect::<Vec<_>>();
metric_engine
.handle_batch_catchup_requests(parallelism, catch_requests)
.await
.unwrap();
{
let state = metric_engine.inner.state.read().unwrap();
for logical_region in &logical_region_ids {
assert!(state.logical_regions().contains_key(logical_region));
}
}
}
} }
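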

View File

@@ -324,9 +324,9 @@ mod test {
let physical_region_id2 = RegionId::new(1024, 1); let physical_region_id2 = RegionId::new(1024, 1);
let logical_region_id1 = RegionId::new(1025, 0); let logical_region_id1 = RegionId::new(1025, 0);
let logical_region_id2 = RegionId::new(1025, 1); let logical_region_id2 = RegionId::new(1025, 1);
env.create_physical_region(physical_region_id1, "/test_dir1", vec![]) env.create_physical_region(physical_region_id1, "/test_dir1")
.await; .await;
env.create_physical_region(physical_region_id2, "/test_dir2", vec![]) env.create_physical_region(physical_region_id2, "/test_dir2")
.await; .await;
let region_create_request1 = crate::test_util::create_logical_region_request( let region_create_request1 = crate::test_util::create_logical_region_request(

View File

@@ -12,45 +12,51 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::HashMap; use common_telemetry::debug;
use common_error::ext::BoxedError;
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
use store_api::region_engine::{BatchResponses, RegionEngine}; use store_api::region_engine::RegionEngine;
use store_api::region_request::{RegionCatchupRequest, ReplayCheckpoint}; use store_api::region_request::{
AffectedRows, RegionCatchupRequest, RegionRequest, ReplayCheckpoint,
};
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::engine::MetricEngineInner; use crate::engine::MetricEngineInner;
use crate::error::{BatchCatchupMitoRegionSnafu, PhysicalRegionNotFoundSnafu, Result}; use crate::error::{
MitoCatchupOperationSnafu, PhysicalRegionNotFoundSnafu, Result, UnsupportedRegionRequestSnafu,
};
use crate::utils; use crate::utils;
impl MetricEngineInner { impl MetricEngineInner {
pub async fn handle_batch_catchup_requests( pub async fn catchup_region(
&self, &self,
parallelism: usize, region_id: RegionId,
requests: Vec<(RegionId, RegionCatchupRequest)>, req: RegionCatchupRequest,
) -> Result<BatchResponses> { ) -> Result<AffectedRows> {
let mut all_requests = Vec::with_capacity(requests.len() * 2); if !self.is_physical_region(region_id) {
let mut physical_region_options_list = Vec::with_capacity(requests.len()); return UnsupportedRegionRequestSnafu {
request: RegionRequest::Catchup(req),
}
.fail();
}
let data_region_id = utils::to_data_region_id(region_id);
let physical_region_options = *self
.state
.read()
.unwrap()
.physical_region_states()
.get(&data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: data_region_id,
})?
.options();
for (region_id, req) in requests { let metadata_region_id = utils::to_metadata_region_id(region_id);
let metadata_region_id = utils::to_metadata_region_id(region_id); // TODO(weny): improve the catchup, we can read the wal entries only once.
let data_region_id = utils::to_data_region_id(region_id); debug!("Catchup metadata region {metadata_region_id}");
self.mito
let physical_region_options = *self .handle_request(
.state
.read()
.unwrap()
.physical_region_states()
.get(&data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: data_region_id,
})?
.options();
physical_region_options_list.push((data_region_id, physical_region_options));
all_requests.push((
metadata_region_id, metadata_region_id,
RegionCatchupRequest { RegionRequest::Catchup(RegionCatchupRequest {
set_writable: req.set_writable, set_writable: req.set_writable,
entry_id: req.metadata_entry_id, entry_id: req.metadata_entry_id,
metadata_entry_id: None, metadata_entry_id: None,
@@ -59,11 +65,16 @@ impl MetricEngineInner {
entry_id: c.metadata_entry_id.unwrap_or_default(), entry_id: c.metadata_entry_id.unwrap_or_default(),
metadata_entry_id: None, metadata_entry_id: None,
}), }),
}, }),
)); )
all_requests.push(( .await
.context(MitoCatchupOperationSnafu)?;
debug!("Catchup data region {data_region_id}");
self.mito
.handle_request(
data_region_id, data_region_id,
RegionCatchupRequest { RegionRequest::Catchup(RegionCatchupRequest {
set_writable: req.set_writable, set_writable: req.set_writable,
entry_id: req.entry_id, entry_id: req.entry_id,
metadata_entry_id: None, metadata_entry_id: None,
@@ -72,45 +83,14 @@ impl MetricEngineInner {
entry_id: c.entry_id, entry_id: c.entry_id,
metadata_entry_id: None, metadata_entry_id: None,
}), }),
}, }),
)); )
}
let mut results = self
.mito
.handle_batch_catchup_requests(parallelism, all_requests)
.await .await
.context(BatchCatchupMitoRegionSnafu {})? .context(MitoCatchupOperationSnafu)
.into_iter() .map(|response| response.affected_rows)?;
.collect::<HashMap<_, _>>();
let mut responses = Vec::with_capacity(physical_region_options_list.len()); self.recover_states(region_id, physical_region_options)
for (physical_region_id, physical_region_options) in physical_region_options_list { .await?;
let metadata_region_id = utils::to_metadata_region_id(physical_region_id); Ok(0)
let data_region_id = utils::to_data_region_id(physical_region_id);
let metadata_region_result = results.remove(&metadata_region_id);
let data_region_result = results.remove(&data_region_id);
// Pass the optional `metadata_region_result` and `data_region_result` to
// `recover_physical_region_with_results`. This function handles errors for each
// physical region catchup request, allowing the process to continue with the
// remaining regions even if some requests fail.
let response = self
.recover_physical_region_with_results(
metadata_region_result,
data_region_result,
physical_region_id,
physical_region_options,
// Note: We intentionally don't close the region if recovery fails.
// Closing it here might confuse the region server since it links RegionIds to Engines.
// If recovery didn't succeed, the region should stay open.
false,
)
.await
.map_err(BoxedError::new);
responses.push((physical_region_id, response));
}
Ok(responses)
} }
} }
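The batch variant above fans out two catchup requests per physical region (one for the metadata region, one for the data region) and then pairs the responses back up by region id before recovering state. A toy sketch of that pairing step, with u64 region ids and a made-up id scheme standing in for the real `utils::to_metadata_region_id` / `utils::to_data_region_id` helpers:

use std::collections::HashMap;

type RegionId = u64;

// Toy id scheme for illustration only.
fn to_metadata_region_id(id: RegionId) -> RegionId { id | 1 << 32 }
fn to_data_region_id(id: RegionId) -> RegionId { id }

// Pull the metadata and data responses for each physical region out of the flat result map.
fn pair_results(
    physical_regions: &[RegionId],
    mut results: HashMap<RegionId, Result<u64, String>>,
) -> Vec<(RegionId, Option<Result<u64, String>>, Option<Result<u64, String>>)> {
    physical_regions
        .iter()
        .map(|&id| {
            let metadata = results.remove(&to_metadata_region_id(id));
            let data = results.remove(&to_data_region_id(id));
            (id, metadata, data)
        })
        .collect()
}

fn main() {
    let mut results = HashMap::new();
    results.insert(to_metadata_region_id(7), Ok(0));
    results.insert(to_data_region_id(7), Ok(0));
    let paired = pair_results(&[7], results);
    assert!(paired[0].1.is_some() && paired[0].2.is_some());
}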

View File

@@ -528,7 +528,7 @@ impl MetricEngineInner {
// set data region options // set data region options
set_data_region_options( set_data_region_options(
&mut data_region_request.options, &mut data_region_request.options,
self.config.sparse_primary_key_encoding, self.config.experimental_sparse_primary_key_encoding,
); );
data_region_request data_region_request
@@ -828,9 +828,9 @@ mod test {
let physical_region_id2 = RegionId::new(1024, 1); let physical_region_id2 = RegionId::new(1024, 1);
let logical_region_id1 = RegionId::new(1025, 0); let logical_region_id1 = RegionId::new(1025, 0);
let logical_region_id2 = RegionId::new(1025, 1); let logical_region_id2 = RegionId::new(1025, 1);
env.create_physical_region(physical_region_id1, "/test_dir1", vec![]) env.create_physical_region(physical_region_id1, "/test_dir1")
.await; .await;
env.create_physical_region(physical_region_id2, "/test_dir2", vec![]) env.create_physical_region(physical_region_id2, "/test_dir2")
.await; .await;
let region_create_request1 = let region_create_request1 =

View File

@@ -76,7 +76,7 @@ mod tests {
]; ];
for (phy_region_id, logi_region_ids) in &phy_to_logi { for (phy_region_id, logi_region_ids) in &phy_to_logi {
env.create_physical_region(*phy_region_id, &TestEnv::default_table_dir(), vec![]) env.create_physical_region(*phy_region_id, &TestEnv::default_table_dir())
.await; .await;
for logi_region_id in logi_region_ids { for logi_region_id in logi_region_ids {
env.create_logical_region(*phy_region_id, *logi_region_id) env.create_logical_region(*phy_region_id, *logi_region_id)
@@ -119,7 +119,6 @@ mod tests {
.index_file_path .index_file_path
.map(|path| path.replace(&e.file_id, "<file_id>")); .map(|path| path.replace(&e.file_id, "<file_id>"));
e.file_id = "<file_id>".to_string(); e.file_id = "<file_id>".to_string();
e.index_file_id = e.index_file_id.map(|_| "<index_file_id>".to_string());
format!("\n{:?}", e) format!("\n{:?}", e)
}) })
.sorted() .sorted()
@@ -128,12 +127,12 @@ mod tests {
assert_eq!( assert_eq!(
debug_format, debug_format,
r#" r#"
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
     );
     // list from storage
     let storage_entries = mito

View File

@@ -47,7 +47,6 @@ impl MetricEngineInner {
         for (region_id, request) in requests {
             if !request.is_physical_table() {
-                warn!("Skipping non-physical table open request: {region_id}");
                 continue;
             }
             let physical_region_options = PhysicalRegionOptions::try_from(&request.options)?;
@@ -73,19 +72,17 @@ impl MetricEngineInner {
             let metadata_region_id = utils::to_metadata_region_id(physical_region_id);
             let data_region_id = utils::to_data_region_id(physical_region_id);
             let metadata_region_result = results.remove(&metadata_region_id);
-            let data_region_result: Option<std::result::Result<RegionResponse, BoxedError>> =
-                results.remove(&data_region_id);
+            let data_region_result = results.remove(&data_region_id);
             // Pass the optional `metadata_region_result` and `data_region_result` to
-            // `recover_physical_region_with_results`. This function handles errors for each
+            // `open_physical_region_with_results`. This function handles errors for each
             // open physical region request, allowing the process to continue with the
             // remaining regions even if some requests fail.
             let response = self
-                .recover_physical_region_with_results(
+                .open_physical_region_with_results(
                     metadata_region_result,
                     data_region_result,
                     physical_region_id,
                     physical_region_options,
-                    true,
                 )
                 .await
                 .map_err(BoxedError::new);
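The comment in this hunk describes the error-handling strategy on both sides of the rename: each physical region's result is pulled out of the shared result map and handled on its own, so a failure in one region does not abort the remaining ones. Below is a minimal, self-contained sketch of that pattern; the `RegionId`, `Response`, and `Error` types are stand-ins invented for illustration and are not the engine's actual types.

use std::collections::HashMap;

type RegionId = u64;

#[derive(Debug)]
struct Response(String);

#[derive(Debug)]
struct Error(String);

/// Takes each region's result out of the shared map and handles it
/// independently, collecting successes and logging failures instead of
/// returning early on the first error.
fn handle_results(
    mut results: HashMap<RegionId, Result<Response, Error>>,
    region_ids: &[RegionId],
) -> Vec<(RegionId, Response)> {
    let mut opened = Vec::new();
    for region_id in region_ids {
        match results.remove(region_id) {
            Some(Ok(response)) => opened.push((*region_id, response)),
            Some(Err(err)) => eprintln!("failed to open region {region_id}: {err:?}"),
            None => eprintln!("no open result for region {region_id}"),
        }
    }
    opened
}

fn main() {
    let mut results = HashMap::new();
    results.insert(1, Ok(Response("ready".to_string())));
    results.insert(2, Err(Error("missing manifest".to_string())));
    // Region 3 never produced a result; the loop simply reports it and moves on.
    let opened = handle_results(results, &[1, 2, 3]);
    println!("opened {} region(s)", opened.len());
}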
@@ -110,13 +107,12 @@ impl MetricEngineInner {
             }
         }
 
-    pub(crate) async fn recover_physical_region_with_results(
+    async fn open_physical_region_with_results(
         &self,
         metadata_region_result: Option<std::result::Result<RegionResponse, BoxedError>>,
         data_region_result: Option<std::result::Result<RegionResponse, BoxedError>>,
         physical_region_id: RegionId,
         physical_region_options: PhysicalRegionOptions,
-        close_region_on_failure: bool,
     ) -> Result<RegionResponse> {
         let metadata_region_id = utils::to_metadata_region_id(physical_region_id);
         let data_region_id = utils::to_data_region_id(physical_region_id);
@@ -140,10 +136,8 @@ impl MetricEngineInner {
             .recover_states(physical_region_id, physical_region_options)
             .await
         {
-            if close_region_on_failure {
-                self.close_physical_region_on_recovery_failure(physical_region_id)
-                    .await;
-            }
+            self.close_physical_region_on_recovery_failure(physical_region_id)
+                .await;
             return Err(err);
         }
         Ok(data_region_response)
@@ -227,7 +221,7 @@ impl MetricEngineInner {
         let mut data_region_options = request.options;
         set_data_region_options(
             &mut data_region_options,
-            self.config.sparse_primary_key_encoding,
+            self.config.experimental_sparse_primary_key_encoding,
         );
         let open_data_region_request = RegionOpenRequest {
             table_dir: request.table_dir.clone(),

View File

@@ -17,12 +17,12 @@
 use std::collections::HashMap;
 
 use store_api::metric_engine_consts::{
-    MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_FALSE_POSITIVE_RATE_OPTION,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_FALSE_POSITIVE_RATE_OPTION_DEFAULT,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION_DEFAULT, METRIC_ENGINE_INDEX_TYPE_OPTION,
 };
+use store_api::mito_engine_options::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING;
 
 use crate::error::{Error, ParseRegionOptionsSnafu, Result};

View File

@@ -16,15 +16,13 @@ use api::v1::{Rows, WriteHint};
 use common_telemetry::{error, info};
 use snafu::{OptionExt, ensure};
 use store_api::codec::PrimaryKeyEncoding;
-use store_api::region_request::{
-    AffectedRows, RegionDeleteRequest, RegionPutRequest, RegionRequest,
-};
+use store_api::region_request::{AffectedRows, RegionPutRequest};
 use store_api::storage::{RegionId, TableId};
 
 use crate::engine::MetricEngineInner;
 use crate::error::{
     ColumnNotFoundSnafu, ForbiddenPhysicalAlterSnafu, LogicalRegionNotFoundSnafu,
-    PhysicalRegionNotFoundSnafu, Result, UnsupportedRegionRequestSnafu,
+    PhysicalRegionNotFoundSnafu, Result,
 };
 use crate::metrics::{FORBIDDEN_OPERATION_COUNT, MITO_OPERATION_ELAPSED};
 use crate::row_modifier::RowsIter;
@@ -52,27 +50,6 @@ impl MetricEngineInner {
         }
     }
 
-    /// Dispatch region delete request
-    pub async fn delete_region(
-        &self,
-        region_id: RegionId,
-        request: RegionDeleteRequest,
-    ) -> Result<AffectedRows> {
-        if self.is_physical_region(region_id) {
-            info!(
-                "Metric region received delete request {request:?} on physical region {region_id:?}"
-            );
-            FORBIDDEN_OPERATION_COUNT.inc();
-
-            UnsupportedRegionRequestSnafu {
-                request: RegionRequest::Delete(request),
-            }
-            .fail()
-        } else {
-            self.delete_logical_region(region_id, request).await
-        }
-    }
-
     async fn put_logical_region(
         &self,
         logical_region_id: RegionId,
@@ -82,13 +59,30 @@ impl MetricEngineInner {
             .with_label_values(&["put"])
             .start_timer();
 
-        let (physical_region_id, data_region_id, primary_key_encoding) =
-            self.find_data_region_meta(logical_region_id)?;
-
-        self.verify_rows(logical_region_id, physical_region_id, &request.rows)
+        let (physical_region_id, data_region_id, primary_key_encoding) = {
+            let state = self.state.read().unwrap();
+            let physical_region_id = *state
+                .logical_regions()
+                .get(&logical_region_id)
+                .with_context(|| LogicalRegionNotFoundSnafu {
+                    region_id: logical_region_id,
+                })?;
+            let data_region_id = to_data_region_id(physical_region_id);
+
+            let primary_key_encoding = state.get_primary_key_encoding(data_region_id).context(
+                PhysicalRegionNotFoundSnafu {
+                    region_id: data_region_id,
+                },
+            )?;
+            (physical_region_id, data_region_id, primary_key_encoding)
+        };
+
+        self.verify_put_request(logical_region_id, physical_region_id, &request)
             .await?;
 
         // write to data region
         // TODO: retrieve table name
         self.modify_rows(
             physical_region_id,
@@ -101,74 +95,19 @@ impl MetricEngineInner {
                 primary_key_encoding: api::v1::PrimaryKeyEncoding::Sparse.into(),
             });
         }
 
-        self.data_region
-            .write_data(data_region_id, RegionRequest::Put(request))
-            .await
+        self.data_region.write_data(data_region_id, request).await
     }
 
-    async fn delete_logical_region(
-        &self,
-        logical_region_id: RegionId,
-        mut request: RegionDeleteRequest,
-    ) -> Result<AffectedRows> {
-        let _timer = MITO_OPERATION_ELAPSED
-            .with_label_values(&["delete"])
-            .start_timer();
-
-        let (physical_region_id, data_region_id, primary_key_encoding) =
-            self.find_data_region_meta(logical_region_id)?;
-
-        self.verify_rows(logical_region_id, physical_region_id, &request.rows)
-            .await?;
-
-        // write to data region
-        // TODO: retrieve table name
-        self.modify_rows(
-            physical_region_id,
-            logical_region_id.table_id(),
-            &mut request.rows,
-            primary_key_encoding,
-        )?;
-        if primary_key_encoding == PrimaryKeyEncoding::Sparse {
-            request.hint = Some(WriteHint {
-                primary_key_encoding: api::v1::PrimaryKeyEncoding::Sparse.into(),
-            });
-        }
-
-        self.data_region
-            .write_data(data_region_id, RegionRequest::Delete(request))
-            .await
-    }
-
-    fn find_data_region_meta(
-        &self,
-        logical_region_id: RegionId,
-    ) -> Result<(RegionId, RegionId, PrimaryKeyEncoding)> {
-        let state = self.state.read().unwrap();
-        let physical_region_id = *state
-            .logical_regions()
-            .get(&logical_region_id)
-            .with_context(|| LogicalRegionNotFoundSnafu {
-                region_id: logical_region_id,
-            })?;
-        let data_region_id = to_data_region_id(physical_region_id);
-
-        let primary_key_encoding = state.get_primary_key_encoding(data_region_id).context(
-            PhysicalRegionNotFoundSnafu {
-                region_id: data_region_id,
-            },
-        )?;
-        Ok((physical_region_id, data_region_id, primary_key_encoding))
-    }
-
-    /// Verifies a request for a logical region against its corresponding metadata region.
+    /// Verifies a put request for a logical region against its corresponding metadata region.
     ///
     /// Includes:
     /// - Check if the logical region exists
     /// - Check if the columns exist
-    async fn verify_rows(
+    async fn verify_put_request(
         &self,
         logical_region_id: RegionId,
         physical_region_id: RegionId,
-        rows: &Rows,
+        request: &RegionPutRequest,
     ) -> Result<()> {
         // Check if the region exists
         let data_region_id = to_data_region_id(physical_region_id);
@@ -189,7 +128,7 @@ impl MetricEngineInner {
                 region_id: data_region_id,
             })?
             .physical_columns();
 
-        for col in &rows.schema {
+        for col in &request.rows.schema {
             ensure!(
                 physical_columns.contains_key(&col.column_name),
                 ColumnNotFoundSnafu {
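The doc comment above states what the verification covers on both sides of the rename: the logical region must be known, and every column referenced by the request must already exist on the physical table. The following is a standalone sketch of that column check, using plain strings in place of the engine's metadata types; the names here are assumptions for illustration only.

use std::collections::HashMap;

#[derive(Debug)]
enum VerifyError {
    ColumnNotFound(String),
}

/// Ensures every column referenced by a request exists in the physical schema.
fn verify_columns(
    physical_columns: &HashMap<String, u32>, // column name -> column id
    request_columns: &[&str],
) -> Result<(), VerifyError> {
    for name in request_columns {
        if !physical_columns.contains_key(*name) {
            return Err(VerifyError::ColumnNotFound(name.to_string()));
        }
    }
    Ok(())
}

fn main() {
    let physical: HashMap<String, u32> =
        [("ts".to_string(), 0), ("host".to_string(), 1), ("cpu".to_string(), 2)].into();
    assert!(verify_columns(&physical, &["ts", "cpu"]).is_ok());
    assert!(verify_columns(&physical, &["ts", "memory"]).is_err());
}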

View File

@@ -50,13 +50,6 @@ pub enum Error {
         location: Location,
     },
 
-    #[snafu(display("Failed to batch catchup mito region"))]
-    BatchCatchupMitoRegion {
-        source: BoxedError,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
     #[snafu(display("No open region result for region {}", region_id))]
     NoOpenRegionResult {
         region_id: RegionId,
@@ -149,6 +142,20 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("Mito delete operation fails"))]
+    MitoDeleteOperation {
+        source: BoxedError,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Mito catchup operation fails"))]
+    MitoCatchupOperation {
+        source: BoxedError,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display("Mito sync operation fails"))]
     MitoSyncOperation {
         source: BoxedError,
@@ -350,10 +357,11 @@ impl ErrorExt for Error {
             | CloseMitoRegion { source, .. }
             | MitoReadOperation { source, .. }
             | MitoWriteOperation { source, .. }
+            | MitoCatchupOperation { source, .. }
             | MitoFlushOperation { source, .. }
+            | MitoDeleteOperation { source, .. }
             | MitoSyncOperation { source, .. }
-            | BatchOpenMitoRegion { source, .. }
-            | BatchCatchupMitoRegion { source, .. } => source.status_code(),
+            | BatchOpenMitoRegion { source, .. } => source.status_code(),
 
             EncodePrimaryKey { source, .. } => source.status_code(),

View File

@@ -536,7 +536,7 @@ impl MetadataRegion {
             .collect();
         let rows = Rows { schema: cols, rows };
 
-        RegionDeleteRequest { rows, hint: None }
+        RegionDeleteRequest { rows }
     }
 
     /// Add logical regions to the metadata region.

View File

@@ -76,17 +76,6 @@ impl TestEnv {
         }
     }
 
-    /// Returns a new env with specific `prefix` and `mito_env` for test.
-    pub async fn with_mito_env(mut mito_env: MitoTestEnv) -> Self {
-        let mito = mito_env.create_engine(MitoConfig::default()).await;
-        let metric = MetricEngine::try_new(mito.clone(), EngineConfig::default()).unwrap();
-        Self {
-            mito_env,
-            mito,
-            metric,
-        }
-    }
-
     pub fn data_home(&self) -> String {
         let env_root = self.mito_env.data_home().to_string_lossy().to_string();
         join_dir(&env_root, "data")
@@ -136,12 +125,7 @@ impl TestEnv {
     }
 
     /// Create regions in [MetricEngine] with specific `physical_region_id`.
-    pub async fn create_physical_region(
-        &self,
-        physical_region_id: RegionId,
-        table_dir: &str,
-        options: Vec<(String, String)>,
-    ) {
+    pub async fn create_physical_region(&self, physical_region_id: RegionId, table_dir: &str) {
         let region_create_request = RegionCreateRequest {
             engine: METRIC_ENGINE_NAME.to_string(),
             column_metadatas: vec![
@@ -167,7 +151,6 @@ impl TestEnv {
             primary_key: vec![],
             options: [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())]
                 .into_iter()
-                .chain(options.into_iter())
                 .collect(),
             table_dir: table_dir.to_string(),
             path_type: PathType::Bare, // Use Bare path type for engine regions
@@ -248,7 +231,7 @@ impl TestEnv {
     /// under [`default_logical_region_id`].
     pub async fn init_metric_region(&self) {
         let physical_region_id = self.default_physical_region_id();
-        self.create_physical_region(physical_region_id, &Self::default_table_dir(), vec![])
+        self.create_physical_region(physical_region_id, &Self::default_table_dir())
             .await;
         let logical_region_id = self.default_logical_region_id();
         self.create_logical_region(physical_region_id, logical_region_id)
@@ -441,22 +424,6 @@ pub fn build_rows(num_tags: usize, num_rows: usize) -> Vec<Row> {
     rows
 }
 
-#[macro_export]
-/// Skip the test if the environment variable `GT_KAFKA_ENDPOINTS` is not set.
-///
-/// The format of the environment variable is:
-/// ```text
-/// GT_KAFKA_ENDPOINTS=localhost:9092,localhost:9093
-/// ```
-macro_rules! maybe_skip_kafka_log_store_integration_test {
-    () => {
-        if std::env::var("GT_KAFKA_ENDPOINTS").is_err() {
-            common_telemetry::warn!("The kafka endpoints is empty, skipping the test");
-            return;
-        }
-    };
-}
-
 #[cfg(test)]
 mod test {
     use object_store::ObjectStore;

View File

@@ -278,41 +278,14 @@ impl SparseReadRowHelper {
         primary_key_encoding: PrimaryKeyEncoding,
     ) -> SparseReadRowHelper {
         if primary_key_encoding == PrimaryKeyEncoding::Sparse {
-            // Optimized case: when schema has exactly 3 columns (primary key, timestamp, and one field),
-            // we can directly use their indices in order without building an explicit mapping.
-            // The column order is: encoded primary key, timestamp, and field.
-            if rows.schema.len() == 3 {
-                let indices = rows
-                    .schema
-                    .iter()
-                    .enumerate()
-                    .map(|(index, _)| Some(index))
-                    .collect();
-                return SparseReadRowHelper {
-                    indices,
-                    num_primary_key_column: 1,
-                };
-            };
-            let mut indices = Vec::with_capacity(rows.schema.len());
-            let name_to_index: HashMap<_, _> = rows
-                .schema
-                .iter()
-                .enumerate()
-                .map(|(index, col)| (&col.column_name, index))
-                .collect();
-            indices.extend(
-                rows.schema[0..2]
-                    .iter()
-                    .enumerate()
-                    .map(|(index, _)| Some(index)),
-            );
-            // Iterate columns and find field columns.
-            for column in metadata.field_columns() {
-                // Get index in request for each field column.
-                let index = name_to_index.get(&column.column_schema.name);
-                indices.push(index.copied());
-            }
+            // We can skip build the indices for sparse primary key encoding.
+            // The order of the columns is encoded primary key, timestamp, field columns.
+            let indices = rows
+                .schema
+                .iter()
+                .enumerate()
+                .map(|(index, _)| Some(index))
+                .collect();
             return SparseReadRowHelper {
                 indices,
                 num_primary_key_column: 1,
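Both versions of this hunk lean on the invariant spelled out in the comments: with sparse primary-key encoding the request schema is laid out as encoded primary key, timestamp, then field columns, so the helper can map request columns to positions either directly or through a name lookup. Below is a small illustrative sketch of the two mapping strategies; the column names are made up and this is not the `SparseReadRowHelper` implementation.

use std::collections::HashMap;

/// Identity mapping: column i in the request is read from position i.
fn identity_indices(num_columns: usize) -> Vec<Option<usize>> {
    (0..num_columns).map(Some).collect()
}

/// Name-based mapping: look up each expected column in the request schema,
/// yielding None for columns the request did not carry.
fn name_based_indices(request_schema: &[&str], expected: &[&str]) -> Vec<Option<usize>> {
    let name_to_index: HashMap<&str, usize> = request_schema
        .iter()
        .enumerate()
        .map(|(index, name)| (*name, index))
        .collect();
    expected
        .iter()
        .map(|name| name_to_index.get(name).copied())
        .collect()
}

fn main() {
    // Sparse layout: encoded primary key, timestamp, then fields.
    let request_schema = ["__primary_key", "ts", "cpu"];
    assert_eq!(identity_indices(request_schema.len()), vec![Some(0), Some(1), Some(2)]);

    let expected = ["__primary_key", "ts", "cpu", "memory"];
    assert_eq!(
        name_based_indices(&request_schema, &expected),
        vec![Some(0), Some(1), Some(2), None]
    );
}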

View File

@@ -477,8 +477,6 @@ fn flat_merge_iterator_bench(c: &mut Criterion) {
                 bulk_part.batch.clone(),
                 context.clone(),
                 None, // No sequence filter
-                1024, // 1024 hosts per part
-                None, // No mem_scan_metrics
             );
             iters.push(Box::new(iter) as _);
         }
@@ -536,13 +534,8 @@ fn bulk_part_record_batch_iter_filter(c: &mut Criterion) {
         );
 
         // Create and iterate over BulkPartRecordBatchIter with filter
-        let iter = BulkPartRecordBatchIter::new(
-            record_batch_with_filter.clone(),
-            context,
-            None, // No sequence filter
-            4096, // 4096 hosts
-            None, // No mem_scan_metrics
-        );
+        let iter =
+            BulkPartRecordBatchIter::new(record_batch_with_filter.clone(), context, None);
 
         // Consume all batches
         for batch_result in iter {
@@ -566,13 +559,7 @@ fn bulk_part_record_batch_iter_filter(c: &mut Criterion) {
         );
 
         // Create and iterate over BulkPartRecordBatchIter
-        let iter = BulkPartRecordBatchIter::new(
-            record_batch_no_filter.clone(),
-            context,
-            None, // No sequence filter
-            4096, // 4096 hosts
-            None, // No mem_scan_metrics
-        );
+        let iter = BulkPartRecordBatchIter::new(record_batch_no_filter.clone(), context, None);
 
         // Consume all batches
         for batch_result in iter {

View File

@@ -20,11 +20,12 @@ use criterion::{Criterion, black_box, criterion_group, criterion_main};
 use datatypes::data_type::ConcreteDataType;
 use datatypes::schema::ColumnSchema;
 use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable;
-use mito2::memtable::{KeyValues, Memtable, MemtableRanges, RangesOptions};
+use mito2::memtable::{KeyValues, Memtable, MemtableRanges};
 use mito2::read;
 use mito2::read::Source;
 use mito2::read::dedup::DedupReader;
 use mito2::read::merge::MergeReaderBuilder;
+use mito2::read::scan_region::PredicateGroup;
 use mito2::region::options::MergeMode;
 use mito2::test_util::column_metadata_to_column_schema;
 use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
@@ -125,7 +126,9 @@ fn create_memtable_with_rows(num_batches: usize) -> SimpleBulkMemtable {
 }
 
 async fn flush(mem: &SimpleBulkMemtable) {
-    let MemtableRanges { ranges, .. } = mem.ranges(None, RangesOptions::for_flush()).unwrap();
+    let MemtableRanges { ranges, .. } = mem
+        .ranges(None, PredicateGroup::default(), None, true)
+        .unwrap();
     let mut source = if ranges.len() == 1 {
         let only_range = ranges.into_values().next().unwrap();
View File

@@ -213,11 +213,7 @@ impl AccessLayer {
     }
 
     /// Deletes a SST file (and its index file if it has one) with given file id.
-    pub(crate) async fn delete_sst(
-        &self,
-        region_file_id: &RegionFileId,
-        index_file_id: &RegionFileId,
-    ) -> Result<()> {
+    pub(crate) async fn delete_sst(&self, region_file_id: &RegionFileId) -> Result<()> {
         let path = location::sst_file_path(&self.table_dir, *region_file_id, self.path_type);
         self.object_store
             .delete(&path)
@@ -226,7 +222,7 @@ impl AccessLayer {
                 file_id: region_file_id.file_id(),
             })?;
 
-        let path = location::index_file_path(&self.table_dir, *index_file_id, self.path_type);
+        let path = location::index_file_path(&self.table_dir, *region_file_id, self.path_type);
         self.object_store
             .delete(&path)
             .await

Some files were not shown because too many files have changed in this diff.