Compare commits


1 Commit

Author: discord9
SHA1: 13582c9efb
Message: bytes trace
Signed-off-by: discord9 <discord9@163.com>
Date: 2025-11-04 11:19:07 +08:00
251 changed files with 2670 additions and 10963 deletions


@@ -613,9 +613,6 @@ jobs:
   - name: "MySQL Kvbackend"
     opts: "--setup-mysql"
     kafka: false
-  - name: "Flat format"
-    opts: "--enable-flat-format"
-    kafka: false
 timeout-minutes: 60
 steps:
   - uses: actions/checkout@v4
@@ -811,7 +808,7 @@ jobs:
   - name: Setup external services
     working-directory: tests-integration/fixtures
     run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
   - name: Run nextest cases
     run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend
     env:


@@ -92,6 +92,5 @@ jobs:
 mode:
   - name: "Basic"
   - name: "Remote WAL"
-  - name: "Flat format"
 steps:
   - run: 'echo "No action required"'

Cargo.lock (generated)

@@ -212,9 +212,8 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
[[package]]
name = "api"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
- "arrow-schema",
 "common-base",
 "common-decimal",
 "common-error",
@@ -733,7 +732,7 @@ dependencies = [
[[package]]
name = "auth"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -1337,9 +1336,13 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
+source = "git+https://github.com/discord9/bytes?rev=1572ab22c3cbad0e9b6681d1f68eca4139322a2a#1572ab22c3cbad0e9b6681d1f68eca4139322a2a"
-checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
dependencies = [
+ "backtrace",
+ "crossbeam-channel",
+ "inferno 0.12.2",
+ "papaya",
+ "quanta",
 "serde",
]
@@ -1383,7 +1386,7 @@ dependencies = [
[[package]]
name = "cache"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "catalog",
 "common-error",
@@ -1418,7 +1421,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow",
@@ -1630,7 +1633,6 @@ dependencies = [
 "chrono",
 "chrono-tz-build",
 "phf 0.11.3",
- "uncased",
]

[[package]]
@@ -1641,8 +1643,6 @@ checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402"
dependencies = [
 "parse-zoneinfo",
 "phf_codegen 0.11.3",
- "phf_shared 0.11.3",
- "uncased",
]

[[package]]
@@ -1763,7 +1763,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
[[package]]
name = "cli"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-stream",
 "async-trait",
@@ -1816,7 +1816,7 @@ dependencies = [
[[package]]
name = "client"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arc-swap",
@@ -1848,8 +1848,8 @@ dependencies = [
 "serde_json",
 "snafu 0.8.6",
 "store-api",
+ "substrait 0.18.0",
 "substrait 0.37.3",
- "substrait 1.0.0-beta.1",
 "tokio",
 "tokio-stream",
 "tonic 0.13.1",
@@ -1889,7 +1889,7 @@ dependencies = [
[[package]]
name = "cmd"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "auth",
@@ -2012,7 +2012,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anymap2",
 "async-trait",
@@ -2036,14 +2036,14 @@ dependencies = [
[[package]]
name = "common-catalog"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "const_format",
]

[[package]]
name = "common-config"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-error",
@@ -2067,7 +2067,7 @@ dependencies = [
[[package]]
name = "common-datasource"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arrow",
 "arrow-schema",
@@ -2102,7 +2102,7 @@ dependencies = [
[[package]]
name = "common-decimal"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "bigdecimal 0.4.8",
 "common-error",
@@ -2115,7 +2115,7 @@ dependencies = [
[[package]]
name = "common-error"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-macro",
 "http 1.3.1",
@@ -2126,7 +2126,7 @@ dependencies = [
[[package]]
name = "common-event-recorder"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -2148,7 +2148,7 @@ dependencies = [
[[package]]
name = "common-frontend"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -2170,7 +2170,7 @@ dependencies = [
[[package]]
name = "common-function"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -2229,7 +2229,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "common-runtime",
@@ -2246,7 +2246,7 @@ dependencies = [
[[package]]
name = "common-grpc"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-flight",
@@ -2279,7 +2279,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "common-base",
@@ -2299,7 +2299,7 @@ dependencies = [
[[package]]
name = "common-macro"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "greptime-proto",
 "once_cell",
@@ -2310,7 +2310,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anyhow",
 "common-error",
@@ -2326,7 +2326,7 @@ dependencies = [
[[package]]
name = "common-meta"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anymap2",
 "api",
@@ -2398,7 +2398,7 @@ dependencies = [
[[package]]
name = "common-options"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-grpc",
 "humantime-serde",
@@ -2407,11 +2407,11 @@ dependencies = [
[[package]]
name = "common-plugins"
-version = "1.0.0-beta.1"
+version = "0.18.0"

[[package]]
name = "common-pprof"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-error",
 "common-macro",
@@ -2423,7 +2423,7 @@ dependencies = [
[[package]]
name = "common-procedure"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-stream",
@@ -2452,7 +2452,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "common-procedure",
@@ -2462,7 +2462,7 @@ dependencies = [
[[package]]
name = "common-query"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -2488,7 +2488,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arc-swap",
 "common-base",
@@ -2512,7 +2512,7 @@ dependencies = [
[[package]]
name = "common-runtime"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "clap 4.5.40",
@@ -2541,7 +2541,7 @@ dependencies = [
[[package]]
name = "common-session"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "serde",
 "strum 0.27.1",
@@ -2549,7 +2549,7 @@ dependencies = [
[[package]]
name = "common-sql"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-decimal",
@@ -2567,7 +2567,7 @@ dependencies = [
[[package]]
name = "common-stat"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-runtime",
@@ -2582,7 +2582,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "backtrace",
 "common-base",
@@ -2611,7 +2611,7 @@ dependencies = [
[[package]]
name = "common-test-util"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "client",
 "common-grpc",
@@ -2624,7 +2624,7 @@ dependencies = [
[[package]]
name = "common-time"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arrow",
 "chrono",
@@ -2642,7 +2642,7 @@ dependencies = [
[[package]]
name = "common-version"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "build-data",
 "cargo-manifest",
@@ -2653,7 +2653,7 @@ dependencies = [
[[package]]
name = "common-wal"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-base",
 "common-error",
@@ -2676,7 +2676,7 @@ dependencies = [
[[package]]
name = "common-workload"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "common-telemetry",
 "serde",
@@ -3913,7 +3913,7 @@ dependencies = [
[[package]]
name = "datanode"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-flight",
@@ -3977,7 +3977,7 @@ dependencies = [
[[package]]
name = "datatypes"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arrow",
 "arrow-array",
@@ -4649,7 +4649,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "file-engine"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -4781,7 +4781,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
[[package]]
name = "flow"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow",
@@ -4850,7 +4850,7 @@ dependencies = [
 "sql",
 "store-api",
 "strum 0.27.1",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tokio",
 "tonic 0.13.1",
@@ -4905,7 +4905,7 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619"
[[package]]
name = "frontend"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arc-swap",
@@ -6116,7 +6116,7 @@ dependencies = [
[[package]]
name = "index"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "asynchronous-codec",
@@ -7045,7 +7045,7 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "log-query"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "chrono",
 "common-error",
@@ -7057,7 +7057,7 @@ dependencies = [
[[package]]
name = "log-store"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-stream",
 "async-trait",
@@ -7364,7 +7364,7 @@ dependencies = [
[[package]]
name = "meta-client"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -7392,7 +7392,7 @@ dependencies = [
[[package]]
name = "meta-srv"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -7490,7 +7490,7 @@ dependencies = [
[[package]]
name = "metric-engine"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "aquamarine",
@@ -7508,7 +7508,6 @@ dependencies = [
 "common-telemetry",
 "common-test-util",
 "common-time",
- "common-wal",
 "datafusion",
 "datatypes",
 "futures-util",
@@ -7585,7 +7584,7 @@ dependencies = [
[[package]]
name = "mito-codec"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "bytes",
@@ -7610,7 +7609,7 @@ dependencies = [
[[package]]
name = "mito2"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "aquamarine",
@@ -8348,7 +8347,7 @@ dependencies = [
[[package]]
name = "object-store"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "anyhow",
 "bytes",
@@ -8633,7 +8632,7 @@ dependencies = [
[[package]]
name = "operator"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -8691,7 +8690,7 @@ dependencies = [
 "sql",
 "sqlparser",
 "store-api",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tokio",
 "tokio-util",
@@ -8867,6 +8866,16 @@ dependencies = [
 "unicode-width 0.1.14",
]

+[[package]]
+name = "papaya"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f"
+dependencies = [
+ "equivalent",
+ "seize",
+]
+
[[package]]
name = "parking"
version = "2.2.1"
@@ -8977,7 +8986,7 @@ dependencies = [
[[package]]
name = "partition"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -9276,9 +9285,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
 "siphasher",
- "uncased",
]

[[package]]
@@ -9322,7 +9330,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pipeline"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -9478,7 +9486,7 @@ dependencies = [
[[package]]
name = "plugins"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "auth",
 "clap 4.5.40",
@@ -9778,7 +9786,7 @@ dependencies = [
[[package]]
name = "promql"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "async-trait",
@@ -10061,7 +10069,7 @@ dependencies = [
[[package]]
name = "puffin"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-compression 0.4.19",
 "async-trait",
@@ -10101,9 +10109,24 @@ dependencies = [
 "variadics",
]

+[[package]]
+name = "quanta"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
+dependencies = [
+ "crossbeam-utils",
+ "libc",
+ "once_cell",
+ "raw-cpuid",
+ "wasi 0.11.1+wasi-snapshot-preview1",
+ "web-sys",
+ "winapi",
+]
+
[[package]]
name = "query"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -10127,7 +10150,6 @@ dependencies = [
 "common-query",
 "common-recordbatch",
 "common-runtime",
- "common-stat",
 "common-telemetry",
 "common-time",
 "datafusion",
@@ -10170,7 +10192,7 @@ dependencies = [
 "sql",
 "sqlparser",
 "store-api",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tokio",
 "tokio-stream",
@@ -10401,6 +10423,15 @@ dependencies = [
 "thiserror 1.0.69",
]

+[[package]]
+name = "raw-cpuid"
+version = "11.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
+dependencies = [
+ "bitflags 2.9.1",
+]
+
[[package]]
name = "rawpointer"
version = "0.2.1"
@@ -11341,6 +11372,16 @@ dependencies = [
 "libc",
]

+[[package]]
+name = "seize"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b55fb86dfd3a2f5f76ea78310a88f96c4ea21a3031f8d212443d56123fd0521"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "semver"
version = "1.0.26"
@@ -11506,7 +11547,7 @@ dependencies = [
[[package]]
name = "servers"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -11632,7 +11673,7 @@ dependencies = [
[[package]]
name = "session"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "ahash 0.8.12",
 "api",
@@ -11966,7 +12007,7 @@ dependencies = [
[[package]]
name = "sql"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-buffer",
@@ -12026,7 +12067,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "clap 4.5.40",
@@ -12303,7 +12344,7 @@ dependencies = [
[[package]]
name = "standalone"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "async-trait",
 "catalog",
@@ -12344,7 +12385,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "store-api"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "aquamarine",
@@ -12509,6 +12550,28 @@ dependencies = [
 "winapi",
]

+[[package]]
+name = "substrait"
+version = "0.18.0"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "common-error",
+ "common-function",
+ "common-macro",
+ "common-telemetry",
+ "datafusion",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-substrait",
+ "datatypes",
+ "promql",
+ "prost 0.13.5",
+ "snafu 0.8.6",
+ "substrait 0.37.3",
+ "tokio",
+]
+
[[package]]
name = "substrait"
version = "0.37.3"
@@ -12555,28 +12618,6 @@ dependencies = [
 "walkdir",
]

-[[package]]
-name = "substrait"
-version = "1.0.0-beta.1"
-dependencies = [
- "async-trait",
- "bytes",
- "common-error",
- "common-function",
- "common-macro",
- "common-telemetry",
- "datafusion",
- "datafusion-common",
- "datafusion-expr",
- "datafusion-substrait",
- "datatypes",
- "promql",
- "prost 0.13.5",
- "snafu 0.8.6",
- "substrait 0.37.3",
- "tokio",
-]
-
[[package]]
name = "subtle"
version = "2.6.1"
@@ -12680,7 +12721,7 @@ dependencies = [
[[package]]
name = "table"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "async-trait",
@@ -12949,7 +12990,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
[[package]]
name = "tests-fuzz"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "arbitrary",
 "async-trait",
@@ -12993,7 +13034,7 @@ dependencies = [
[[package]]
name = "tests-integration"
-version = "1.0.0-beta.1"
+version = "0.18.0"
dependencies = [
 "api",
 "arrow-flight",
@@ -13067,7 +13108,7 @@ dependencies = [
 "sqlx",
 "standalone",
 "store-api",
- "substrait 1.0.0-beta.1",
+ "substrait 0.18.0",
 "table",
 "tempfile",
 "time",
@@ -13977,15 +14018,6 @@ dependencies = [
 "serde",
]

-[[package]]
-name = "uncased"
-version = "0.9.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697"
-dependencies = [
- "version_check",
-]
-
[[package]]
name = "unescaper"
version = "0.1.6"


@@ -74,7 +74,7 @@ members = [
resolver = "2"

[workspace.package]
-version = "1.0.0-beta.1"
+version = "0.18.0"
edition = "2024"
license = "Apache-2.0"
@@ -118,7 +118,7 @@ bitflags = "2.4.1"
bytemuck = "1.12"
bytes = { version = "1.7", features = ["serde"] }
chrono = { version = "0.4", features = ["serde"] }
-chrono-tz = { version = "0.10.1", features = ["case-insensitive"] }
+chrono-tz = "0.10.1"
clap = { version = "4.4", features = ["derive"] }
config = "0.13.0"
const_format = "0.2"
@@ -219,7 +219,12 @@ similar-asserts = "1.6.0"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.8"
sqlparser = { version = "0.58.0", default-features = false, features = ["std", "visitor", "serde"] }
-sqlx = { version = "0.8", default-features = false, features = ["any", "macros", "json", "runtime-tokio-rustls"] }
+sqlx = { version = "0.8", features = [
+    "runtime-tokio-rustls",
+    "mysql",
+    "postgres",
+    "chrono",
+] }
strum = { version = "0.27", features = ["derive"] }
sysinfo = "0.33"
tempfile = "3"
@@ -328,6 +333,7 @@ datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git"
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"
+bytes = { git = "https://github.com/discord9/bytes", rev = "1572ab22c3cbad0e9b6681d1f68eca4139322a2a" }

[profile.release]
debug = 1


@@ -12,6 +12,7 @@
<div align="center">
<h3 align="center">
+<a href="https://greptime.com/product/cloud">GreptimeCloud</a> |
<a href="https://docs.greptime.com/">User Guide</a> |
<a href="https://greptimedb.rs/">API Docs</a> |
<a href="https://github.com/GreptimeTeam/greptimedb/issues/5446">Roadmap 2025</a>
@@ -104,6 +105,16 @@ Read [more benchmark reports](https://docs.greptime.com/user-guide/concepts/feat
## Try GreptimeDB

+### 1. [Live Demo](https://greptime.com/playground)
+Experience GreptimeDB directly in your browser.
+### 2. [GreptimeCloud](https://console.greptime.cloud/)
+Start instantly with a free cluster.
+### 3. Docker (Local Quickstart)

```shell
docker pull greptime/greptimedb
```


@@ -16,7 +16,7 @@
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
-| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
+| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. |
| `runtime` | -- | -- | The runtime options. |
@@ -104,7 +104,6 @@
| `flow.num_workers` | Integer | `0` | The number of flow worker in flownode.<br/>Not setting(or set to 0) this value will use the number of CPU cores divided by 2. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
@@ -152,13 +151,10 @@
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
-| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
-| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
-| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
@@ -192,7 +188,7 @@
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `region_engine.file` | -- | -- | Enable the file engine. |
| `region_engine.metric` | -- | -- | Metric engine options. |
-| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. |
+| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -312,7 +308,6 @@
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `query.allow_query_fallback` | Bool | `false` | Whether to allow query fallback when push down optimize fails.<br/>Default to false, meaning when push down optimize failed, return error msg |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans (only applies to datanodes). |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.connect_timeout` | String | `10s` | -- |
@@ -451,7 +446,7 @@
| `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.<br/>It will block the datanode start if it can't receive leases in the heartbeat from metasrv. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
-| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
+| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
@@ -505,7 +500,6 @@
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries during read WAL.<br/>**It's only used when the provider is `kafka`**.<br/><br/>This option ensures that when Kafka messages are deleted, the system<br/>can still successfully replay memtable data without throwing an<br/>out-of-range error.<br/>However, enabling this option might lead to unexpected data loss,<br/>as the system will skip over missing entries instead of treating<br/>them as critical errors. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
@@ -555,13 +549,10 @@
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
-| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
-| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
-| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
@@ -595,7 +586,7 @@
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `region_engine.file` | -- | -- | Enable the file engine. |
| `region_engine.metric` | -- | -- | Metric engine options. |
-| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. |
+| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -682,6 +673,5 @@
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `query` | -- | -- | -- |
| `query.parallelism` | Integer | `1` | Parallelism of the query engine for query sent by flownode.<br/>Default to 1, so it won't use too much cpu or memory |
-| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |


@@ -18,9 +18,6 @@ init_regions_in_background = false
init_regions_parallelism = 16

## The maximum current queries allowed to be executed. Zero means unlimited.
-## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
-## When set, 70% of queries get privileged memory access (full scan_memory_limit).
-## The remaining 30% get standard tier access (70% of scan_memory_limit).
max_concurrent_queries = 0

## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -264,13 +261,6 @@ overwrite_entry_start_id = false
## Default to 0, which means the number of CPU cores.
parallelism = 0

-## Memory pool size for query execution operators (aggregation, sorting, join).
-## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
-## Setting it to 0 disables the limit (unbounded, default behavior).
-## When this limit is reached, queries will fail with ResourceExhausted error.
-## NOTE: This does NOT limit memory used by table scans.
-memory_pool_size = "50%"

## The data storage options.
[storage]
## The working home directory.
@@ -499,17 +489,6 @@ write_cache_size = "5GiB"
## @toml2docs:none-default
write_cache_ttl = "8h"

-## Preload index (puffin) files into cache on region open (default: true).
-## When enabled, index files are loaded into the write cache during region initialization,
-## which can improve query performance at the cost of longer startup times.
-preload_index_cache = true

-## Percentage of write cache capacity allocated for index (puffin) files (default: 20).
-## The remaining capacity is used for data (parquet) files.
-## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
-## 1GiB is reserved for index files and 4GiB for data files.
-index_cache_percent = 20

## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
@@ -522,14 +501,6 @@ max_concurrent_scan_files = 384
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false

-## Memory limit for table scans across all queries.
-## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
-## Setting it to 0 disables the limit.
-## NOTE: Works with max_concurrent_queries for tiered memory allocation.
-## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
-## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
-scan_memory_limit = "50%"

## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m"
@@ -669,8 +640,8 @@ fork_dictionary_bytes = "1GiB"
[[region_engine]]
## Metric engine options.
[region_engine.metric]
-## Whether to use sparse primary key encoding.
-sparse_primary_key_encoding = true
+## Whether to enable the experimental sparse primary key encoding.
+experimental_sparse_primary_key_encoding = false

## The logging options.
[logging]


@@ -158,13 +158,6 @@ default_ratio = 1.0
## Default to 1, so it won't use too much cpu or memory
parallelism = 1

-## Memory pool size for query execution operators (aggregation, sorting, join).
-## Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%").
-## Setting it to 0 disables the limit (unbounded, default behavior).
-## When this limit is reached, queries will fail with ResourceExhausted error.
-## NOTE: This does NOT limit memory used by table scans.
-memory_pool_size = "50%"

## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.


@@ -256,13 +256,6 @@ parallelism = 0
## Default to false, meaning when push down optimize failed, return error msg
allow_query_fallback = false

-## Memory pool size for query execution operators (aggregation, sorting, join).
-## Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%").
-## Setting it to 0 disables the limit (unbounded, default behavior).
-## When this limit is reached, queries will fail with ResourceExhausted error.
-## NOTE: This does NOT limit memory used by table scans (only applies to datanodes).
-memory_pool_size = "50%"

## Datanode options.
[datanode]
## Datanode client options.


@@ -14,9 +14,6 @@ init_regions_in_background = false
init_regions_parallelism = 16 init_regions_parallelism = 16
## The maximum current queries allowed to be executed. Zero means unlimited. ## The maximum current queries allowed to be executed. Zero means unlimited.
## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
## When set, 70% of queries get privileged memory access (full scan_memory_limit).
## The remaining 30% get standard tier access (70% of scan_memory_limit).
max_concurrent_queries = 0 max_concurrent_queries = 0
## Enable telemetry to collect anonymous usage data. Enabled by default. ## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -368,13 +365,6 @@ max_running_procedures = 128
## Default to 0, which means the number of CPU cores. ## Default to 0, which means the number of CPU cores.
parallelism = 0 parallelism = 0
## Memory pool size for query execution operators (aggregation, sorting, join).
## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit (unbounded, default behavior).
## When this limit is reached, queries will fail with ResourceExhausted error.
## NOTE: This does NOT limit memory used by table scans.
memory_pool_size = "50%"
## The data storage options. ## The data storage options.
[storage] [storage]
## The working home directory. ## The working home directory.
@@ -590,17 +580,6 @@ write_cache_size = "5GiB"
## @toml2docs:none-default ## @toml2docs:none-default
write_cache_ttl = "8h" write_cache_ttl = "8h"
## Preload index (puffin) files into cache on region open (default: true).
## When enabled, index files are loaded into the write cache during region initialization,
## which can improve query performance at the cost of longer startup times.
preload_index_cache = true
## Percentage of write cache capacity allocated for index (puffin) files (default: 20).
## The remaining capacity is used for data (parquet) files.
## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
## 1GiB is reserved for index files and 4GiB for data files.
index_cache_percent = 20
## Buffer size for SST writing. ## Buffer size for SST writing.
sst_write_buffer_size = "8MB" sst_write_buffer_size = "8MB"
@@ -613,14 +592,6 @@ max_concurrent_scan_files = 384
## Whether to allow stale WAL entries read during replay. ## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false allow_stale_entries = false
## Memory limit for table scans across all queries.
## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit.
## NOTE: Works with max_concurrent_queries for tiered memory allocation.
## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
scan_memory_limit = "50%"
## Minimum time interval between two compactions. ## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions). ## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m" min_compaction_interval = "0m"
@@ -760,8 +731,8 @@ fork_dictionary_bytes = "1GiB"
[[region_engine]] [[region_engine]]
## Metric engine options. ## Metric engine options.
[region_engine.metric] [region_engine.metric]
## Whether to use sparse primary key encoding. ## Whether to enable the experimental sparse primary key encoding.
sparse_primary_key_encoding = true experimental_sparse_primary_key_encoding = false
## The logging options. ## The logging options.
[logging] [logging]
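The removed comments for `max_concurrent_queries` and `scan_memory_limit` above describe a tiered allocation: with a concurrency cap set, 70% of queries get privileged access to the full scan limit and the rest are capped at 70% of it; with unlimited concurrency, the first 20 queries are privileged. The sketch below only restates that documented arithmetic; the function name and constants are illustrative, not the engine's actual implementation.

```rust
/// Illustrative only: mirrors the documented tiering of `scan_memory_limit`,
/// not the real region engine implementation.
fn scan_memory_tiers(scan_memory_limit: u64, max_concurrent_queries: u64) -> (u64, u64, u64) {
    // Number of queries that get privileged (full-limit) access.
    let privileged_slots = if max_concurrent_queries == 0 {
        // Unlimited concurrency: the first 20 queries are privileged.
        20
    } else {
        // Otherwise 70% of the allowed queries are privileged.
        max_concurrent_queries * 70 / 100
    };
    // Privileged queries may use the full limit; the rest are capped at 70% of it.
    let privileged_limit = scan_memory_limit;
    let standard_limit = scan_memory_limit * 70 / 100;
    (privileged_slots, privileged_limit, standard_limit)
}

fn main() {
    // Example: 8 GiB scan limit, 100 concurrent queries allowed.
    let (slots, full, capped) = scan_memory_tiers(8 << 30, 100);
    println!("{slots} privileged queries at {full} bytes each, others capped at {capped} bytes");
}
```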

View File

@@ -92,6 +92,9 @@ curl -X POST localhost:4000/debug/prof/mem > greptime.hprof
curl -X POST "localhost:4000/debug/prof/mem?output=flamegraph" > greptime.svg curl -X POST "localhost:4000/debug/prof/mem?output=flamegraph" > greptime.svg
# or output pprof format # or output pprof format
curl -X POST "localhost:4000/debug/prof/mem?output=proto" > greptime.pprof curl -X POST "localhost:4000/debug/prof/mem?output=proto" > greptime.pprof
curl -X POST "localhost:4000/debug/prof/bytes" > greptime.svg
``` ```
You can periodically dump profiling data and compare them to find the delta memory usage. You can periodically dump profiling data and compare them to find the delta memory usage.

View File

@@ -8,7 +8,6 @@ license.workspace = true
workspace = true workspace = true
[dependencies] [dependencies]
arrow-schema.workspace = true
common-base.workspace = true common-base.workspace = true
common-decimal.workspace = true common-decimal.workspace = true
common-error.workspace = true common-error.workspace = true

View File

@@ -14,11 +14,10 @@
use std::collections::HashMap; use std::collections::HashMap;
use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};
use datatypes::schema::{ use datatypes::schema::{
COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer, COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY,
SkippingIndexType, SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType,
}; };
use greptime_proto::v1::{ use greptime_proto::v1::{
Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType, Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType,
@@ -69,14 +68,8 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) { if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) {
metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned()); metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned());
} }
if let Some(extension_name) = options.options.get(EXTENSION_TYPE_NAME_KEY) { if let Some(settings) = options.options.get(JSON_STRUCTURE_SETTINGS_KEY) {
metadata.insert(EXTENSION_TYPE_NAME_KEY.to_string(), extension_name.clone()); metadata.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
}
if let Some(extension_metadata) = options.options.get(EXTENSION_TYPE_METADATA_KEY) {
metadata.insert(
EXTENSION_TYPE_METADATA_KEY.to_string(),
extension_metadata.clone(),
);
} }
} }
@@ -149,16 +142,10 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<Column
.options .options
.insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone()); .insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone());
} }
if let Some(extension_name) = column_schema.metadata().get(EXTENSION_TYPE_NAME_KEY) { if let Some(settings) = column_schema.metadata().get(JSON_STRUCTURE_SETTINGS_KEY) {
options options
.options .options
.insert(EXTENSION_TYPE_NAME_KEY.to_string(), extension_name.clone()); .insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
}
if let Some(extension_metadata) = column_schema.metadata().get(EXTENSION_TYPE_METADATA_KEY) {
options.options.insert(
EXTENSION_TYPE_METADATA_KEY.to_string(),
extension_metadata.clone(),
);
} }
(!options.options.is_empty()).then_some(options) (!options.options.is_empty()).then_some(options)

View File

@@ -97,6 +97,7 @@ lazy_static! {
ROUTINES, ROUTINES,
SCHEMA_PRIVILEGES, SCHEMA_PRIVILEGES,
TABLE_PRIVILEGES, TABLE_PRIVILEGES,
TRIGGERS,
GLOBAL_STATUS, GLOBAL_STATUS,
SESSION_STATUS, SESSION_STATUS,
PARTITIONS, PARTITIONS,
@@ -206,6 +207,7 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
ROUTINES => setup_memory_table!(ROUTINES), ROUTINES => setup_memory_table!(ROUTINES),
SCHEMA_PRIVILEGES => setup_memory_table!(SCHEMA_PRIVILEGES), SCHEMA_PRIVILEGES => setup_memory_table!(SCHEMA_PRIVILEGES),
TABLE_PRIVILEGES => setup_memory_table!(TABLE_PRIVILEGES), TABLE_PRIVILEGES => setup_memory_table!(TABLE_PRIVILEGES),
TRIGGERS => setup_memory_table!(TRIGGERS),
GLOBAL_STATUS => setup_memory_table!(GLOBAL_STATUS), GLOBAL_STATUS => setup_memory_table!(GLOBAL_STATUS),
SESSION_STATUS => setup_memory_table!(SESSION_STATUS), SESSION_STATUS => setup_memory_table!(SESSION_STATUS),
KEY_COLUMN_USAGE => Some(Arc::new(InformationSchemaKeyColumnUsage::new( KEY_COLUMN_USAGE => Some(Arc::new(InformationSchemaKeyColumnUsage::new(

View File

@@ -15,7 +15,8 @@
use std::sync::Arc; use std::sync::Arc;
use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE}; use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE};
use datatypes::schema::{Schema, SchemaRef}; use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::vectors::{Int64Vector, StringVector, VectorRef}; use datatypes::vectors::{Int64Vector, StringVector, VectorRef};
use crate::system_schema::information_schema::table_names::*; use crate::system_schema::information_schema::table_names::*;
@@ -365,6 +366,16 @@ pub(super) fn get_schema_columns(table_name: &str) -> (SchemaRef, Vec<VectorRef>
vec![], vec![],
), ),
TRIGGERS => (
vec![
string_column("TRIGGER_NAME"),
ColumnSchema::new("trigger_id", ConcreteDataType::uint64_datatype(), false),
string_column("TRIGGER_DEFINITION"),
ColumnSchema::new("flownode_id", ConcreteDataType::uint64_datatype(), true),
],
vec![],
),
// TODO: Considering store internal metrics in `global_status` and // TODO: Considering store internal metrics in `global_status` and
// `session_status` tables. // `session_status` tables.
GLOBAL_STATUS => ( GLOBAL_STATUS => (

View File

@@ -23,8 +23,6 @@ use crate::Tool;
use crate::data::export::ExportCommand; use crate::data::export::ExportCommand;
use crate::data::import::ImportCommand; use crate::data::import::ImportCommand;
pub(crate) const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";
/// Command for data operations including exporting data from and importing data into GreptimeDB. /// Command for data operations including exporting data from and importing data into GreptimeDB.
#[derive(Subcommand)] #[derive(Subcommand)]
pub enum DataCommand { pub enum DataCommand {

View File

@@ -30,7 +30,7 @@ use snafu::{OptionExt, ResultExt};
use tokio::sync::Semaphore; use tokio::sync::Semaphore;
use tokio::time::Instant; use tokio::time::Instant;
use crate::data::{COPY_PATH_PLACEHOLDER, default_database}; use crate::data::default_database;
use crate::database::{DatabaseClient, parse_proxy_opts}; use crate::database::{DatabaseClient, parse_proxy_opts};
use crate::error::{ use crate::error::{
EmptyResultSnafu, Error, OpenDalSnafu, OutputDirNotSetSnafu, Result, S3ConfigNotSetSnafu, EmptyResultSnafu, Error, OpenDalSnafu, OutputDirNotSetSnafu, Result, S3ConfigNotSetSnafu,
@@ -668,26 +668,10 @@ impl Export {
); );
// Create copy_from.sql file // Create copy_from.sql file
let copy_database_from_sql = { let copy_database_from_sql = format!(
let command_without_connection = format!( r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#,
r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({});"#, export_self.catalog, schema, path, with_options_clone, connection_part
export_self.catalog, schema, COPY_PATH_PLACEHOLDER, with_options_clone );
);
if connection_part.is_empty() {
command_without_connection
} else {
let command_with_connection = format!(
r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#,
export_self.catalog, schema, path, with_options_clone, connection_part
);
format!(
"-- {}\n{}",
command_with_connection, command_without_connection
)
}
};
let copy_from_path = export_self.get_file_path(&schema, "copy_from.sql"); let copy_from_path = export_self.get_file_path(&schema, "copy_from.sql");
export_self export_self

View File

@@ -21,13 +21,13 @@ use clap::{Parser, ValueEnum};
use common_catalog::consts::DEFAULT_SCHEMA_NAME; use common_catalog::consts::DEFAULT_SCHEMA_NAME;
use common_error::ext::BoxedError; use common_error::ext::BoxedError;
use common_telemetry::{error, info, warn}; use common_telemetry::{error, info, warn};
use snafu::{OptionExt, ResultExt, ensure}; use snafu::{OptionExt, ResultExt};
use tokio::sync::Semaphore; use tokio::sync::Semaphore;
use tokio::time::Instant; use tokio::time::Instant;
use crate::data::{COPY_PATH_PLACEHOLDER, default_database}; use crate::data::default_database;
use crate::database::{DatabaseClient, parse_proxy_opts}; use crate::database::{DatabaseClient, parse_proxy_opts};
use crate::error::{Error, FileIoSnafu, InvalidArgumentsSnafu, Result, SchemaNotFoundSnafu}; use crate::error::{Error, FileIoSnafu, Result, SchemaNotFoundSnafu};
use crate::{Tool, database}; use crate::{Tool, database};
#[derive(Debug, Default, Clone, ValueEnum)] #[derive(Debug, Default, Clone, ValueEnum)]
@@ -148,15 +148,12 @@ impl Import {
let _permit = semaphore_moved.acquire().await.unwrap(); let _permit = semaphore_moved.acquire().await.unwrap();
let database_input_dir = self.catalog_path().join(&schema); let database_input_dir = self.catalog_path().join(&schema);
let sql_file = database_input_dir.join(filename); let sql_file = database_input_dir.join(filename);
let mut sql = tokio::fs::read_to_string(sql_file) let sql = tokio::fs::read_to_string(sql_file)
.await .await
.context(FileIoSnafu)?; .context(FileIoSnafu)?;
if sql.trim().is_empty() { if sql.is_empty() {
info!("Empty `{filename}` {database_input_dir:?}"); info!("Empty `{filename}` {database_input_dir:?}");
} else { } else {
if filename == "copy_from.sql" {
sql = self.rewrite_copy_database_sql(&schema, &sql)?;
}
let db = exec_db.unwrap_or(&schema); let db = exec_db.unwrap_or(&schema);
self.database_client.sql(&sql, db).await?; self.database_client.sql(&sql, db).await?;
info!("Imported `{filename}` for database {schema}"); info!("Imported `{filename}` for database {schema}");
@@ -229,57 +226,6 @@ impl Import {
} }
Ok(db_names) Ok(db_names)
} }
fn rewrite_copy_database_sql(&self, schema: &str, sql: &str) -> Result<String> {
let target_location = self.build_copy_database_location(schema);
let escaped_location = target_location.replace('\'', "''");
let mut first_stmt_checked = false;
for line in sql.lines() {
let trimmed = line.trim_start();
if trimmed.is_empty() || trimmed.starts_with("--") {
continue;
}
ensure!(
trimmed.starts_with("COPY DATABASE"),
InvalidArgumentsSnafu {
msg: "Expected COPY DATABASE statement at start of copy_from.sql"
}
);
first_stmt_checked = true;
break;
}
ensure!(
first_stmt_checked,
InvalidArgumentsSnafu {
msg: "COPY DATABASE statement not found in copy_from.sql"
}
);
ensure!(
sql.contains(COPY_PATH_PLACEHOLDER),
InvalidArgumentsSnafu {
msg: format!(
"Placeholder `{}` not found in COPY DATABASE statement",
COPY_PATH_PLACEHOLDER
)
}
);
Ok(sql.replacen(COPY_PATH_PLACEHOLDER, &escaped_location, 1))
}
fn build_copy_database_location(&self, schema: &str) -> String {
let mut path = self.catalog_path();
path.push(schema);
let mut path_str = path.to_string_lossy().into_owned();
if !path_str.ends_with('/') {
path_str.push('/');
}
path_str
}
} }
#[async_trait] #[async_trait]
@@ -295,52 +241,3 @@ impl Tool for Import {
} }
} }
} }
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::*;
fn build_import(input_dir: &str) -> Import {
Import {
catalog: "catalog".to_string(),
schema: None,
database_client: DatabaseClient::new(
"127.0.0.1:4000".to_string(),
"catalog".to_string(),
None,
Duration::from_secs(0),
None,
),
input_dir: input_dir.to_string(),
parallelism: 1,
target: ImportTarget::Data,
}
}
#[test]
fn rewrite_copy_database_sql_replaces_placeholder() {
let import = build_import("/tmp/export-path");
let comment = "-- COPY DATABASE \"catalog\".\"schema\" FROM 's3://bucket/demo/' WITH (format = 'parquet') CONNECTION (region = 'us-west-2')";
let sql = format!(
"{comment}\nCOPY DATABASE \"catalog\".\"schema\" FROM '{}' WITH (format = 'parquet');",
COPY_PATH_PLACEHOLDER
);
let rewritten = import.rewrite_copy_database_sql("schema", &sql).unwrap();
let expected_location = import.build_copy_database_location("schema");
let escaped = expected_location.replace('\'', "''");
assert!(rewritten.starts_with(comment));
assert!(rewritten.contains(&format!("FROM '{escaped}'")));
assert!(!rewritten.contains(COPY_PATH_PLACEHOLDER));
}
#[test]
fn rewrite_copy_database_sql_requires_placeholder() {
let import = build_import("/tmp/export-path");
let sql = "COPY DATABASE \"catalog\".\"schema\" FROM '/tmp/export-path/catalog/schema/' WITH (format = 'parquet');";
assert!(import.rewrite_copy_database_sql("schema", sql).is_err());
}
}
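Taken together, the export/import flow above writes `copy_from.sql` with the `<PATH/TO/FILES>` placeholder (keeping the fully specified remote command as a leading comment) and substitutes the local data directory at import time, doubling single quotes so the path stays a valid SQL string literal. A minimal sketch of that substitution, reusing the placeholder constant from the diff; it is an illustration, not the `Import` tool itself.

```rust
const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";

/// Replace the placeholder with a concrete local directory, escaping any
/// single quotes so the value remains a valid SQL string literal.
fn substitute_copy_path(sql: &str, local_dir: &str) -> String {
    let escaped = local_dir.replace('\'', "''");
    sql.replacen(COPY_PATH_PLACEHOLDER, &escaped, 1)
}

fn main() {
    let exported = format!(
        r#"COPY DATABASE "catalog"."schema" FROM '{}' WITH (format = 'parquet');"#,
        COPY_PATH_PLACEHOLDER
    );
    let rewritten = substitute_copy_path(&exported, "/tmp/export-path/catalog/schema/");
    assert!(rewritten.contains("FROM '/tmp/export-path/catalog/schema/'"));
    println!("{rewritten}");
}
```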

View File

@@ -20,9 +20,7 @@ use api::v1::health_check_client::HealthCheckClient;
use api::v1::prometheus_gateway_client::PrometheusGatewayClient; use api::v1::prometheus_gateway_client::PrometheusGatewayClient;
use api::v1::region::region_client::RegionClient as PbRegionClient; use api::v1::region::region_client::RegionClient as PbRegionClient;
use arrow_flight::flight_service_client::FlightServiceClient; use arrow_flight::flight_service_client::FlightServiceClient;
use common_grpc::channel_manager::{ use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption};
ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config,
};
use parking_lot::RwLock; use parking_lot::RwLock;
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
use tonic::codec::CompressionEncoding; use tonic::codec::CompressionEncoding;
@@ -96,9 +94,8 @@ impl Client {
A: AsRef<[U]>, A: AsRef<[U]>,
{ {
let channel_config = ChannelConfig::default().client_tls_config(client_tls); let channel_config = ChannelConfig::default().client_tls_config(client_tls);
let tls_config = load_tls_config(channel_config.client_tls.as_ref()) let channel_manager = ChannelManager::with_tls_config(channel_config)
.context(error::CreateTlsChannelSnafu)?; .context(error::CreateTlsChannelSnafu)?;
let channel_manager = ChannelManager::with_config(channel_config, tls_config);
Ok(Self::with_manager_and_urls(channel_manager, urls)) Ok(Self::with_manager_and_urls(channel_manager, urls))
} }

View File

@@ -74,7 +74,7 @@ impl FlownodeManager for NodeClients {
impl NodeClients { impl NodeClients {
pub fn new(config: ChannelConfig) -> Self { pub fn new(config: ChannelConfig) -> Self {
Self { Self {
channel_manager: ChannelManager::with_config(config, None), channel_manager: ChannelManager::with_config(config),
clients: CacheBuilder::new(1024) clients: CacheBuilder::new(1024)
.time_to_live(Duration::from_secs(30 * 60)) .time_to_live(Duration::from_secs(30 * 60))
.time_to_idle(Duration::from_secs(5 * 60)) .time_to_idle(Duration::from_secs(5 * 60))

View File

@@ -162,7 +162,6 @@ impl ObjbenchCommand {
file_size, file_size,
available_indexes: Default::default(), available_indexes: Default::default(),
index_file_size: 0, index_file_size: 0,
index_file_id: None,
num_rows, num_rows,
num_row_groups, num_row_groups,
sequence: None, sequence: None,

View File

@@ -15,7 +15,6 @@
use std::time::Duration; use std::time::Duration;
use cmd::options::GreptimeOptions; use cmd::options::GreptimeOptions;
use common_base::memory_limit::MemoryLimit;
use common_config::{Configurable, DEFAULT_DATA_HOME}; use common_config::{Configurable, DEFAULT_DATA_HOME};
use common_options::datanode::{ClientOptions, DatanodeClientOptions}; use common_options::datanode::{ClientOptions, DatanodeClientOptions};
use common_telemetry::logging::{DEFAULT_LOGGING_DIR, DEFAULT_OTLP_HTTP_ENDPOINT, LoggingOptions}; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, DEFAULT_OTLP_HTTP_ENDPOINT, LoggingOptions};
@@ -75,19 +74,14 @@ fn test_load_datanode_example_config() {
RegionEngineConfig::Mito(MitoConfig { RegionEngineConfig::Mito(MitoConfig {
auto_flush_interval: Duration::from_secs(3600), auto_flush_interval: Duration::from_secs(3600),
write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
scan_memory_limit: MemoryLimit::Percentage(50),
..Default::default() ..Default::default()
}), }),
RegionEngineConfig::File(FileEngineConfig {}), RegionEngineConfig::File(FileEngineConfig {}),
RegionEngineConfig::Metric(MetricEngineConfig { RegionEngineConfig::Metric(MetricEngineConfig {
sparse_primary_key_encoding: true, experimental_sparse_primary_key_encoding: false,
flush_metadata_region_interval: Duration::from_secs(30), flush_metadata_region_interval: Duration::from_secs(30),
}), }),
], ],
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
logging: LoggingOptions { logging: LoggingOptions {
level: Some("info".to_string()), level: Some("info".to_string()),
dir: format!("{}/{}", DEFAULT_DATA_HOME, DEFAULT_LOGGING_DIR), dir: format!("{}/{}", DEFAULT_DATA_HOME, DEFAULT_LOGGING_DIR),
@@ -161,10 +155,6 @@ fn test_load_frontend_example_config() {
cors_allowed_origins: vec!["https://example.com".to_string()], cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default() ..Default::default()
}, },
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
..Default::default() ..Default::default()
}, },
..Default::default() ..Default::default()
@@ -252,7 +242,6 @@ fn test_load_flownode_example_config() {
query: QueryOptions { query: QueryOptions {
parallelism: 1, parallelism: 1,
allow_query_fallback: false, allow_query_fallback: false,
memory_pool_size: MemoryLimit::Percentage(50),
}, },
meta_client: Some(MetaClientOptions { meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()], metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
@@ -297,12 +286,11 @@ fn test_load_standalone_example_config() {
RegionEngineConfig::Mito(MitoConfig { RegionEngineConfig::Mito(MitoConfig {
auto_flush_interval: Duration::from_secs(3600), auto_flush_interval: Duration::from_secs(3600),
write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
scan_memory_limit: MemoryLimit::Percentage(50),
..Default::default() ..Default::default()
}), }),
RegionEngineConfig::File(FileEngineConfig {}), RegionEngineConfig::File(FileEngineConfig {}),
RegionEngineConfig::Metric(MetricEngineConfig { RegionEngineConfig::Metric(MetricEngineConfig {
sparse_primary_key_encoding: true, experimental_sparse_primary_key_encoding: false,
flush_metadata_region_interval: Duration::from_secs(30), flush_metadata_region_interval: Duration::from_secs(30),
}), }),
], ],
@@ -326,10 +314,7 @@ fn test_load_standalone_example_config() {
cors_allowed_origins: vec!["https://example.com".to_string()], cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default() ..Default::default()
}, },
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
..Default::default() ..Default::default()
}, },
..Default::default() ..Default::default()

View File

@@ -15,7 +15,6 @@
pub mod bit_vec; pub mod bit_vec;
pub mod bytes; pub mod bytes;
pub mod cancellation; pub mod cancellation;
pub mod memory_limit;
pub mod plugins; pub mod plugins;
pub mod range_read; pub mod range_read;
#[allow(clippy::all)] #[allow(clippy::all)]

View File

@@ -1,265 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use std::str::FromStr;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::readable_size::ReadableSize;
/// Memory limit configuration that supports both absolute size and percentage.
///
/// Examples:
/// - Absolute size: "2GB", "4GiB", "512MB"
/// - Percentage: "50%", "75%"
/// - Unlimited: "unlimited", "0"
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum MemoryLimit {
/// Absolute memory size.
Size(ReadableSize),
/// Percentage of total system memory (0-100).
Percentage(u8),
/// No memory limit.
#[default]
Unlimited,
}
impl MemoryLimit {
/// Resolve the memory limit to bytes based on total system memory.
/// Returns 0 if the limit is unlimited.
pub fn resolve(&self, total_memory_bytes: u64) -> u64 {
match self {
MemoryLimit::Size(size) => size.as_bytes(),
MemoryLimit::Percentage(pct) => total_memory_bytes * (*pct as u64) / 100,
MemoryLimit::Unlimited => 0,
}
}
/// Returns true if this limit is unlimited.
pub fn is_unlimited(&self) -> bool {
match self {
MemoryLimit::Size(size) => size.as_bytes() == 0,
MemoryLimit::Percentage(pct) => *pct == 0,
MemoryLimit::Unlimited => true,
}
}
}
impl FromStr for MemoryLimit {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
if s.eq_ignore_ascii_case("unlimited") {
return Ok(MemoryLimit::Unlimited);
}
if let Some(pct_str) = s.strip_suffix('%') {
let pct = pct_str
.trim()
.parse::<u8>()
.map_err(|e| format!("invalid percentage value '{}': {}", pct_str, e))?;
if pct > 100 {
return Err(format!("percentage must be between 0 and 100, got {}", pct));
}
if pct == 0 {
Ok(MemoryLimit::Unlimited)
} else {
Ok(MemoryLimit::Percentage(pct))
}
} else {
let size = ReadableSize::from_str(s)?;
if size.as_bytes() == 0 {
Ok(MemoryLimit::Unlimited)
} else {
Ok(MemoryLimit::Size(size))
}
}
}
}
impl Display for MemoryLimit {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemoryLimit::Size(size) => write!(f, "{}", size),
MemoryLimit::Percentage(pct) => write!(f, "{}%", pct),
MemoryLimit::Unlimited => write!(f, "unlimited"),
}
}
}
impl Serialize for MemoryLimit {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(&self.to_string())
}
}
impl<'de> Deserialize<'de> for MemoryLimit {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
MemoryLimit::from_str(&s).map_err(serde::de::Error::custom)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_absolute_size() {
assert_eq!(
"2GB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024))
);
assert_eq!(
"512MB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Size(ReadableSize(512 * 1024 * 1024))
);
assert_eq!("0".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
}
#[test]
fn test_parse_percentage() {
assert_eq!(
"50%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(50)
);
assert_eq!(
"75%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(75)
);
assert_eq!("0%".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
}
#[test]
fn test_parse_invalid() {
assert!("150%".parse::<MemoryLimit>().is_err());
assert!("-10%".parse::<MemoryLimit>().is_err());
assert!("invalid".parse::<MemoryLimit>().is_err());
}
#[test]
fn test_resolve() {
let total = 8 * 1024 * 1024 * 1024; // 8GB
assert_eq!(
MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024)).resolve(total),
2 * 1024 * 1024 * 1024
);
assert_eq!(
MemoryLimit::Percentage(50).resolve(total),
4 * 1024 * 1024 * 1024
);
assert_eq!(MemoryLimit::Unlimited.resolve(total), 0);
}
#[test]
fn test_is_unlimited() {
assert!(MemoryLimit::Unlimited.is_unlimited());
assert!(!MemoryLimit::Size(ReadableSize(1024)).is_unlimited());
assert!(!MemoryLimit::Percentage(50).is_unlimited());
assert!(!MemoryLimit::Percentage(1).is_unlimited());
// Defensive: these states shouldn't exist via public API, but check anyway
assert!(MemoryLimit::Size(ReadableSize(0)).is_unlimited());
assert!(MemoryLimit::Percentage(0).is_unlimited());
}
#[test]
fn test_parse_100_percent() {
assert_eq!(
"100%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(100)
);
}
#[test]
fn test_display_percentage() {
assert_eq!(MemoryLimit::Percentage(20).to_string(), "20%");
assert_eq!(MemoryLimit::Percentage(50).to_string(), "50%");
assert_eq!(MemoryLimit::Percentage(100).to_string(), "100%");
}
#[test]
fn test_parse_unlimited() {
assert_eq!(
"unlimited".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
assert_eq!(
"UNLIMITED".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
assert_eq!(
"Unlimited".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
}
#[test]
fn test_display_unlimited() {
assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited");
}
#[test]
fn test_parse_display_roundtrip() {
let cases = vec![
"50%",
"100%",
"1%",
"2GB",
"512MB",
"unlimited",
"UNLIMITED",
"0", // normalized to unlimited
"0%", // normalized to unlimited
];
for input in cases {
let parsed = input.parse::<MemoryLimit>().unwrap();
let displayed = parsed.to_string();
let reparsed = displayed.parse::<MemoryLimit>().unwrap();
assert_eq!(
parsed, reparsed,
"round-trip failed: '{}' -> '{}' -> '{:?}'",
input, displayed, reparsed
);
}
}
#[test]
fn test_zero_normalization() {
// All forms of zero should normalize to Unlimited
assert_eq!("0".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!("0%".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!("0B".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!(
"0KB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
// Unlimited always displays as "unlimited"
assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited");
}
}
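For context on the `memory_pool_size` and `scan_memory_limit` settings elsewhere in this diff, the `MemoryLimit` type above resolves a configured string against total system memory: an absolute size passes through, a percentage is scaled, and zero or "unlimited" resolves to 0 (meaning no limit). Below is a condensed, self-contained stand-in for that resolution; the real type also parses human-readable sizes via `ReadableSize`.

```rust
/// Condensed stand-in for `MemoryLimit::resolve`, for illustration only.
#[derive(Debug, PartialEq)]
enum Limit {
    Size(u64),      // absolute bytes, e.g. parsed from "2GB"
    Percentage(u8), // e.g. parsed from "50%"
    Unlimited,      // "unlimited", "0", or "0%"
}

fn resolve(limit: &Limit, total_memory_bytes: u64) -> u64 {
    match limit {
        Limit::Size(bytes) => *bytes,
        Limit::Percentage(pct) => total_memory_bytes * (*pct as u64) / 100,
        // Callers treat 0 as "no limit", matching the implementation above.
        Limit::Unlimited => 0,
    }
}

fn main() {
    let total = 8u64 << 30; // assume 8 GiB of system memory
    assert_eq!(resolve(&Limit::Percentage(50), total), 4 << 30); // "50%" -> 4 GiB
    assert_eq!(resolve(&Limit::Size(2 << 30), total), 2 << 30); // "2GB" -> 2 GiB
    assert_eq!(resolve(&Limit::Unlimited, total), 0); // "unlimited" -> no limit
    println!("ok");
}
```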

View File

@@ -45,19 +45,3 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap {
header.insert(GREPTIME_DB_HEADER_ERROR_MSG, msg); header.insert(GREPTIME_DB_HEADER_ERROR_MSG, msg);
header header
} }
/// Returns the external root cause of the source error (exclude the current error).
pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> {
// There are some divergence about the behavior of the `sources()` API
// in https://github.com/rust-lang/rust/issues/58520
// So this function iterates the sources manually.
let mut root = err.source();
while let Some(r) = root {
if let Some(s) = r.source() {
root = Some(s);
} else {
break;
}
}
root
}
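A quick illustration of what the removed `root_source` helper returned: it walks `Error::source()` to the innermost error and reports that one, excluding the error it started from. The sketch below uses two hand-rolled wrapper errors so it runs standalone; the loop itself mirrors the deleted function.

```rust
use std::error::Error;
use std::fmt;

#[derive(Debug)]
struct Outer(Inner);
#[derive(Debug)]
struct Inner(std::io::Error);

impl fmt::Display for Outer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "outer error")
    }
}
impl fmt::Display for Inner {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "inner error")
    }
}
impl Error for Outer {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}
impl Error for Inner {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}

/// Same walk as the removed helper: follow `source()` to the deepest error.
fn root_source(err: &dyn Error) -> Option<&dyn Error> {
    let mut root = err.source();
    while let Some(r) = root {
        match r.source() {
            Some(s) => root = Some(s),
            None => break,
        }
    }
    root
}

fn main() {
    let io = std::io::Error::new(std::io::ErrorKind::Other, "disk full");
    let err = Outer(Inner(io));
    // The root cause is the io::Error, not `Outer` itself or `Inner`.
    println!("{}", root_source(&err).unwrap()); // prints "disk full"
}
```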

View File

@@ -104,7 +104,7 @@ impl MetaClientSelector {
let cfg = ChannelConfig::new() let cfg = ChannelConfig::new()
.connect_timeout(Duration::from_secs(30)) .connect_timeout(Duration::from_secs(30))
.timeout(Duration::from_secs(30)); .timeout(Duration::from_secs(30));
let channel_manager = ChannelManager::with_config(cfg, None); let channel_manager = ChannelManager::with_config(cfg);
Self { Self {
meta_client, meta_client,
channel_manager, channel_manager,

View File

@@ -12,12 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use crate::aggrs::vector::avg::VectorAvg;
use crate::aggrs::vector::product::VectorProduct; use crate::aggrs::vector::product::VectorProduct;
use crate::aggrs::vector::sum::VectorSum; use crate::aggrs::vector::sum::VectorSum;
use crate::function_registry::FunctionRegistry; use crate::function_registry::FunctionRegistry;
mod avg;
mod product; mod product;
mod sum; mod sum;
@@ -27,6 +25,5 @@ impl VectorFunction {
pub fn register(registry: &FunctionRegistry) { pub fn register(registry: &FunctionRegistry) {
registry.register_aggr(VectorSum::uadf_impl()); registry.register_aggr(VectorSum::uadf_impl());
registry.register_aggr(VectorProduct::uadf_impl()); registry.register_aggr(VectorProduct::uadf_impl());
registry.register_aggr(VectorAvg::uadf_impl());
} }
} }

View File

@@ -1,270 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, AsArray, BinaryArray, LargeStringArray, StringArray};
use arrow::compute::sum;
use arrow::datatypes::UInt64Type;
use arrow_schema::{DataType, Field};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{
Accumulator, AggregateUDF, Signature, SimpleAggregateUDF, TypeSignature, Volatility,
};
use datafusion_functions_aggregate_common::accumulator::AccumulatorArgs;
use nalgebra::{Const, DVector, DVectorView, Dyn, OVector};
use crate::scalars::vector::impl_conv::{
binlit_as_veclit, parse_veclit_from_strlit, veclit_to_binlit,
};
/// The accumulator for the `vec_avg` aggregate function.
#[derive(Debug, Default)]
pub struct VectorAvg {
sum: Option<OVector<f32, Dyn>>,
count: u64,
}
impl VectorAvg {
/// Create a new `AggregateUDF` for the `vec_avg` aggregate function.
pub fn uadf_impl() -> AggregateUDF {
let signature = Signature::one_of(
vec![
TypeSignature::Exact(vec![DataType::Utf8]),
TypeSignature::Exact(vec![DataType::LargeUtf8]),
TypeSignature::Exact(vec![DataType::Binary]),
],
Volatility::Immutable,
);
let udaf = SimpleAggregateUDF::new_with_signature(
"vec_avg",
signature,
DataType::Binary,
Arc::new(Self::accumulator),
vec![
Arc::new(Field::new("sum", DataType::Binary, true)),
Arc::new(Field::new("count", DataType::UInt64, true)),
],
);
AggregateUDF::from(udaf)
}
fn accumulator(args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
if args.schema.fields().len() != 1 {
return Err(datafusion_common::DataFusionError::Internal(format!(
"expect creating `VEC_AVG` with only one input field, actual {}",
args.schema.fields().len()
)));
}
let t = args.schema.field(0).data_type();
if !matches!(t, DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary) {
return Err(datafusion_common::DataFusionError::Internal(format!(
"unexpected input datatype {t} when creating `VEC_AVG`"
)));
}
Ok(Box::new(VectorAvg::default()))
}
fn inner(&mut self, len: usize) -> &mut OVector<f32, Dyn> {
self.sum
.get_or_insert_with(|| OVector::zeros_generic(Dyn(len), Const::<1>))
}
fn update(&mut self, values: &[ArrayRef], is_update: bool) -> Result<()> {
if values.is_empty() {
return Ok(());
};
let vectors = match values[0].data_type() {
DataType::Utf8 => {
let arr: &StringArray = values[0].as_string();
arr.iter()
.filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into)))
.map(|x| x.map(Cow::Owned))
.collect::<Result<Vec<_>>>()?
}
DataType::LargeUtf8 => {
let arr: &LargeStringArray = values[0].as_string();
arr.iter()
.filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into)))
.map(|x: Result<Vec<f32>>| x.map(Cow::Owned))
.collect::<Result<Vec<_>>>()?
}
DataType::Binary => {
let arr: &BinaryArray = values[0].as_binary();
arr.iter()
.filter_map(|x| x.map(|b| binlit_as_veclit(b).map_err(Into::into)))
.collect::<Result<Vec<_>>>()?
}
_ => {
return Err(datafusion_common::DataFusionError::NotImplemented(format!(
"unsupported data type {} for `VEC_AVG`",
values[0].data_type()
)));
}
};
if vectors.is_empty() {
return Ok(());
}
let len = if is_update {
vectors.len() as u64
} else {
sum(values[1].as_primitive::<UInt64Type>()).unwrap_or_default()
};
let dims = vectors[0].len();
let mut sum = DVector::zeros(dims);
for v in vectors {
if v.len() != dims {
return Err(datafusion_common::DataFusionError::Execution(
"vectors length not match: VEC_AVG".to_string(),
));
}
let v_view = DVectorView::from_slice(&v, dims);
sum += &v_view;
}
*self.inner(dims) += sum;
self.count += len;
Ok(())
}
}
impl Accumulator for VectorAvg {
fn state(&mut self) -> Result<Vec<ScalarValue>> {
let vector = match &self.sum {
None => ScalarValue::Binary(None),
Some(sum) => ScalarValue::Binary(Some(veclit_to_binlit(sum.as_slice()))),
};
Ok(vec![vector, ScalarValue::from(self.count)])
}
fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
self.update(values, true)
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
self.update(states, false)
}
fn evaluate(&mut self) -> Result<ScalarValue> {
match &self.sum {
None => Ok(ScalarValue::Binary(None)),
Some(sum) => Ok(ScalarValue::Binary(Some(veclit_to_binlit(
(sum / self.count as f32).as_slice(),
)))),
}
}
fn size(&self) -> usize {
size_of_val(self)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::StringArray;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{ConstantVector, StringVector, Vector};
use super::*;
#[test]
fn test_update_batch() {
// test update empty batch, expect not updating anything
let mut vec_avg = VectorAvg::default();
vec_avg.update_batch(&[]).unwrap();
assert!(vec_avg.sum.is_none());
assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap());
// test update one not-null value
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[2.5, 3.5, 4.5]))),
vec_avg.evaluate().unwrap()
);
// test update one null value
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![Option::<String>::None]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap());
// test update no null-value batch
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))),
vec_avg.evaluate().unwrap()
);
// test update null-value batch
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
None,
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))),
vec_avg.evaluate().unwrap()
);
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
None,
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[5.5, 6.5, 7.5]))),
vec_avg.evaluate().unwrap()
);
// test update with constant vector
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![
Arc::new(ConstantVector::new(
Arc::new(StringVector::from_vec(vec!["[1.0,2.0,3.0]".to_string()])),
4,
))
.to_arrow_array(),
];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[1.0, 2.0, 3.0]))),
vec_avg.evaluate().unwrap()
);
}
}

View File

@@ -14,7 +14,6 @@
mod convert; mod convert;
mod distance; mod distance;
mod elem_avg;
mod elem_product; mod elem_product;
mod elem_sum; mod elem_sum;
pub mod impl_conv; pub mod impl_conv;
@@ -65,7 +64,6 @@ impl VectorFunction {
registry.register_scalar(vector_subvector::VectorSubvectorFunction::default()); registry.register_scalar(vector_subvector::VectorSubvectorFunction::default());
registry.register_scalar(elem_sum::ElemSumFunction::default()); registry.register_scalar(elem_sum::ElemSumFunction::default());
registry.register_scalar(elem_product::ElemProductFunction::default()); registry.register_scalar(elem_product::ElemProductFunction::default());
registry.register_scalar(elem_avg::ElemAvgFunction::default());
} }
} }

View File

@@ -1,128 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::Display;
use datafusion::arrow::datatypes::DataType;
use datafusion::logical_expr::ColumnarValue;
use datafusion_common::ScalarValue;
use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS};
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use nalgebra::DVectorView;
use crate::function::Function;
use crate::scalars::vector::{VectorCalculator, impl_conv};
const NAME: &str = "vec_elem_avg";
#[derive(Debug, Clone)]
pub(crate) struct ElemAvgFunction {
signature: Signature,
}
impl Default for ElemAvgFunction {
fn default() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::Uniform(1, STRINGS.to_vec()),
TypeSignature::Uniform(1, BINARYS.to_vec()),
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
],
Volatility::Immutable,
),
}
}
}
impl Function for ElemAvgFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Float32)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let body = |v0: &ScalarValue| -> datafusion_common::Result<ScalarValue> {
let v0 =
impl_conv::as_veclit(v0)?.map(|v0| DVectorView::from_slice(&v0, v0.len()).mean());
Ok(ScalarValue::Float32(v0))
};
let calculator = VectorCalculator {
name: self.name(),
func: body,
};
calculator.invoke_with_single_argument(args)
}
}
impl Display for ElemAvgFunction {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::StringViewArray;
use arrow_schema::Field;
use datafusion::arrow::array::{Array, AsArray};
use datafusion::arrow::datatypes::Float32Type;
use datafusion_common::config::ConfigOptions;
use super::*;
#[test]
fn test_elem_avg() {
let func = ElemAvgFunction::default();
let input = Arc::new(StringViewArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
None,
]));
let result = func
.invoke_with_args(ScalarFunctionArgs {
args: vec![ColumnarValue::Array(input.clone())],
arg_fields: vec![],
number_rows: input.len(),
return_field: Arc::new(Field::new("x", DataType::Float32, true)),
config_options: Arc::new(ConfigOptions::new()),
})
.and_then(|v| ColumnarValue::values_to_arrays(&[v]))
.map(|mut a| a.remove(0))
.unwrap();
let result = result.as_primitive::<Float32Type>();
assert_eq!(result.len(), 4);
assert_eq!(result.value(0), 2.0);
assert_eq!(result.value(1), 5.0);
assert_eq!(result.value(2), 8.0);
assert!(result.is_null(3));
}
}
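The two removed functions compute different averages: `vec_avg` (the aggregate deleted further above) averages many vectors element-wise into a single vector, while `vec_elem_avg` (this file) averages the elements inside one vector into a single float. A plain-`Vec<f32>` sketch of the two semantics, for illustration only; the real implementations operate on Arrow arrays.

```rust
/// Element-wise average across rows: the `vec_avg` aggregate semantics.
fn vec_avg(rows: &[Vec<f32>]) -> Option<Vec<f32>> {
    let first = rows.first()?;
    let dims = first.len();
    let mut sum = vec![0.0f32; dims];
    for row in rows {
        assert_eq!(row.len(), dims, "vectors must have the same dimension");
        for (s, v) in sum.iter_mut().zip(row) {
            *s += v;
        }
    }
    let n = rows.len() as f32;
    Some(sum.into_iter().map(|s| s / n).collect())
}

/// Average of the elements inside one vector: the `vec_elem_avg` scalar semantics.
fn vec_elem_avg(v: &[f32]) -> Option<f32> {
    if v.is_empty() {
        return None;
    }
    Some(v.iter().sum::<f32>() / v.len() as f32)
}

fn main() {
    let rows = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]];
    assert_eq!(vec_avg(&rows), Some(vec![2.5, 3.5, 4.5])); // matches the test above
    assert_eq!(vec_elem_avg(&[1.0, 2.0, 3.0]), Some(2.0));
    println!("ok");
}
```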

View File

@@ -22,14 +22,14 @@ use dashmap::DashMap;
use dashmap::mapref::entry::Entry; use dashmap::mapref::entry::Entry;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::ResultExt; use snafu::{OptionExt, ResultExt};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tonic::transport::{ use tonic::transport::{
Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri, Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri,
}; };
use tower::Service; use tower::Service;
use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, Result}; use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, InvalidTlsConfigSnafu, Result};
const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60; const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60;
pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10; pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10;
@@ -91,18 +91,57 @@ impl ChannelManager {
Default::default() Default::default()
} }
/// unified with-config function that supports a tls config pub fn with_config(config: ChannelConfig) -> Self {
/// use [`load_tls_config`] to load the tls config from the file system let inner = Inner::with_config(config);
pub fn with_config(config: ChannelConfig, tls_config: Option<ClientTlsConfig>) -> Self {
let mut inner = Inner::with_config(config.clone());
if let Some(tls_config) = tls_config {
inner.client_tls_config = Some(tls_config);
}
Self { Self {
inner: Arc::new(inner), inner: Arc::new(inner),
} }
} }
/// Read tls cert and key files and create a ChannelManager with TLS config.
pub fn with_tls_config(config: ChannelConfig) -> Result<Self> {
let mut inner = Inner::with_config(config.clone());
// setup tls
let path_config = config.client_tls.context(InvalidTlsConfigSnafu {
msg: "no config input",
})?;
if !path_config.enabled {
// If TLS is not enabled, ignore the rest of the TLS config
// and leave `client_tls_config` unset, so TLS is not used.
return Ok(Self {
inner: Arc::new(inner),
});
}
let mut tls_config = ClientTlsConfig::new();
if let Some(server_ca) = path_config.server_ca_cert_path {
let server_root_ca_cert =
std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?;
let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert);
tls_config = tls_config.ca_certificate(server_root_ca_cert);
}
if let (Some(client_cert_path), Some(client_key_path)) =
(&path_config.client_cert_path, &path_config.client_key_path)
{
let client_cert =
std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?;
let client_key =
std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
tls_config = tls_config.identity(client_identity);
}
inner.client_tls_config = Some(tls_config);
Ok(Self {
inner: Arc::new(inner),
})
}
pub fn config(&self) -> &ChannelConfig { pub fn config(&self) -> &ChannelConfig {
&self.inner.config &self.inner.config
} }
@@ -248,34 +287,6 @@ impl ChannelManager {
} }
} }
pub fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result<Option<ClientTlsConfig>> {
let path_config = match tls_option {
Some(path_config) if path_config.enabled => path_config,
_ => return Ok(None),
};
let mut tls_config = ClientTlsConfig::new();
if let Some(server_ca) = &path_config.server_ca_cert_path {
let server_root_ca_cert =
std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?;
let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert);
tls_config = tls_config.ca_certificate(server_root_ca_cert);
}
if let (Some(client_cert_path), Some(client_key_path)) =
(&path_config.client_cert_path, &path_config.client_key_path)
{
let client_cert =
std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?;
let client_key =
std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
tls_config = tls_config.identity(client_identity);
}
Ok(Some(tls_config))
}
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ClientTlsOption { pub struct ClientTlsOption {
/// Whether to enable TLS for client. /// Whether to enable TLS for client.
@@ -648,7 +659,7 @@ mod tests {
.http2_adaptive_window(true) .http2_adaptive_window(true)
.tcp_keepalive(Duration::from_secs(2)) .tcp_keepalive(Duration::from_secs(2))
.tcp_nodelay(true); .tcp_nodelay(true);
let mgr = ChannelManager::with_config(config, None); let mgr = ChannelManager::with_config(config);
let res = mgr.build_endpoint("test_addr"); let res = mgr.build_endpoint("test_addr");

View File

@@ -12,17 +12,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use common_grpc::channel_manager::{ use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption};
ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config,
};
#[tokio::test] #[tokio::test]
async fn test_mtls_config() { async fn test_mtls_config() {
// test no config // test no config
let config = ChannelConfig::new(); let config = ChannelConfig::new();
let re = load_tls_config(config.client_tls.as_ref()); let re = ChannelManager::with_tls_config(config);
assert!(re.is_ok()); assert!(re.is_err());
assert!(re.unwrap().is_none());
// test wrong file // test wrong file
let config = ChannelConfig::new().client_tls_config(ClientTlsOption { let config = ChannelConfig::new().client_tls_config(ClientTlsOption {
@@ -32,7 +29,7 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/wrong_client.key".to_string()), client_key_path: Some("tests/tls/wrong_client.key".to_string()),
}); });
let re = load_tls_config(config.client_tls.as_ref()); let re = ChannelManager::with_tls_config(config);
assert!(re.is_err()); assert!(re.is_err());
// test corrupted file content // test corrupted file content
@@ -43,9 +40,7 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/corrupted".to_string()), client_key_path: Some("tests/tls/corrupted".to_string()),
}); });
let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); let re = ChannelManager::with_tls_config(config).unwrap();
let re = ChannelManager::with_config(config, tls_config);
let re = re.get("127.0.0.1:0"); let re = re.get("127.0.0.1:0");
assert!(re.is_err()); assert!(re.is_err());
@@ -57,8 +52,7 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/client.key".to_string()), client_key_path: Some("tests/tls/client.key".to_string()),
}); });
let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); let re = ChannelManager::with_tls_config(config).unwrap();
let re = ChannelManager::with_config(config, tls_config);
let re = re.get("127.0.0.1:0"); let re = re.get("127.0.0.1:0");
let _ = re.unwrap(); let _ = re.unwrap();
} }

View File

@@ -77,10 +77,7 @@ serde_json.workspace = true
serde_with.workspace = true serde_with.workspace = true
session.workspace = true session.workspace = true
snafu.workspace = true snafu.workspace = true
sqlx = { workspace = true, features = [ sqlx = { workspace = true, optional = true }
"mysql",
"chrono",
], optional = true }
store-api.workspace = true store-api.workspace = true
strum.workspace = true strum.workspace = true
table = { workspace = true, features = ["testing"] } table = { workspace = true, features = ["testing"] }

View File

@@ -442,7 +442,7 @@ pub fn extract_column_metadatas(
results: &mut [RegionResponse], results: &mut [RegionResponse],
key: &str, key: &str,
) -> Result<Option<Vec<ColumnMetadata>>> { ) -> Result<Option<Vec<ColumnMetadata>>> {
let mut schemas = results let schemas = results
.iter_mut() .iter_mut()
.map(|r| r.extensions.remove(key)) .map(|r| r.extensions.remove(key))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@@ -454,24 +454,20 @@ pub fn extract_column_metadatas(
// Verify all the physical schemas are the same // Verify all the physical schemas are the same
// Safety: previous check ensures this vec is not empty // Safety: previous check ensures this vec is not empty
let first_column_metadatas = schemas let first = schemas.first().unwrap();
.swap_remove(0) ensure!(
.map(|first_bytes| ColumnMetadata::decode_list(&first_bytes).context(DecodeJsonSnafu)) schemas.iter().all(|x| x == first),
.transpose()?; MetadataCorruptionSnafu {
err_msg: "The table column metadata schemas from datanodes are not the same."
}
);
for s in schemas { if let Some(first) = first {
// check decoded column metadata instead of bytes because it contains the extension map. let column_metadatas = ColumnMetadata::decode_list(first).context(DecodeJsonSnafu)?;
let column_metadata = s Ok(Some(column_metadatas))
.map(|bytes| ColumnMetadata::decode_list(&bytes).context(DecodeJsonSnafu)) } else {
.transpose()?; Ok(None)
ensure!(
column_metadata == first_column_metadatas,
MetadataCorruptionSnafu {
err_msg: "The table column metadata schemas from datanodes are not the same."
}
);
} }
Ok(first_column_metadatas)
} }
#[cfg(test)] #[cfg(test)]
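The removed branch compares decoded `ColumnMetadata` values rather than the raw extension bytes. One plausible motivation, sketched here purely for illustration (the keys below are made up and `serde_json` is assumed as a dependency): two equal maps can serialize to different byte sequences when key order differs, so decoding before comparing avoids false mismatches.

```rust
use std::collections::HashMap;

fn main() {
    // Same logical metadata, different key order in the encoded bytes.
    let a = r#"{"extension":"1","comment":"c"}"#;
    let b = r#"{"comment":"c","extension":"1"}"#;

    // Byte comparison says they differ...
    assert_ne!(a.as_bytes(), b.as_bytes());

    // ...but the decoded values are equal.
    let da: HashMap<String, String> = serde_json::from_str(a).unwrap();
    let db: HashMap<String, String> = serde_json::from_str(b).unwrap();
    assert_eq!(da, db);
    println!("decoded values are equal");
}
```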

View File

@@ -250,7 +250,7 @@ pub struct UpgradeRegion {
/// `None` stands for no wait, /// `None` stands for no wait,
/// it's helpful to verify whether the leader region is ready. /// it's helpful to verify whether the leader region is ready.
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub replay_timeout: Duration, pub replay_timeout: Option<Duration>,
/// The hint for replaying memtable. /// The hint for replaying memtable.
#[serde(default)] #[serde(default)]
pub location_id: Option<u64>, pub location_id: Option<u64>,
@@ -507,14 +507,13 @@ pub enum Instruction {
/// Closes regions. /// Closes regions.
#[serde(deserialize_with = "single_or_multiple_from", alias = "CloseRegion")] #[serde(deserialize_with = "single_or_multiple_from", alias = "CloseRegion")]
CloseRegions(Vec<RegionIdent>), CloseRegions(Vec<RegionIdent>),
/// Upgrades regions. /// Upgrades a region.
#[serde(deserialize_with = "single_or_multiple_from", alias = "UpgradeRegion")] UpgradeRegion(UpgradeRegion),
UpgradeRegions(Vec<UpgradeRegion>),
#[serde( #[serde(
deserialize_with = "single_or_multiple_from", deserialize_with = "single_or_multiple_from",
alias = "DowngradeRegion" alias = "DowngradeRegion"
)] )]
/// Downgrades regions. /// Downgrades a region.
DowngradeRegions(Vec<DowngradeRegion>), DowngradeRegions(Vec<DowngradeRegion>),
/// Invalidates batch cache. /// Invalidates batch cache.
InvalidateCaches(Vec<CacheIdent>), InvalidateCaches(Vec<CacheIdent>),
@@ -560,9 +559,9 @@ impl Instruction {
} }
/// Converts the instruction into a [UpgradeRegion]. /// Converts the instruction into a [UpgradeRegion].
pub fn into_upgrade_regions(self) -> Option<Vec<UpgradeRegion>> { pub fn into_upgrade_regions(self) -> Option<UpgradeRegion> {
match self { match self {
Self::UpgradeRegions(upgrade_region) => Some(upgrade_region), Self::UpgradeRegion(upgrade_region) => Some(upgrade_region),
_ => None, _ => None,
} }
} }
@@ -585,10 +584,6 @@ impl Instruction {
/// The reply of [UpgradeRegion]. /// The reply of [UpgradeRegion].
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionReply { pub struct UpgradeRegionReply {
/// The [RegionId].
/// For compatibility, it is defaulted to [RegionId::new(0, 0)].
#[serde(default)]
pub region_id: RegionId,
/// Returns true if `last_entry_id` has been replayed to the latest. /// Returns true if `last_entry_id` has been replayed to the latest.
pub ready: bool, pub ready: bool,
/// Indicates whether the region exists. /// Indicates whether the region exists.
@@ -640,39 +635,6 @@ where
}) })
} }
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionsReply {
pub replies: Vec<UpgradeRegionReply>,
}
impl UpgradeRegionsReply {
pub fn new(replies: Vec<UpgradeRegionReply>) -> Self {
Self { replies }
}
pub fn single(reply: UpgradeRegionReply) -> Self {
Self::new(vec![reply])
}
}
#[derive(Deserialize)]
#[serde(untagged)]
enum UpgradeRegionsCompat {
Single(UpgradeRegionReply),
Multiple(UpgradeRegionsReply),
}
fn upgrade_regions_compat_from<'de, D>(deserializer: D) -> Result<UpgradeRegionsReply, D::Error>
where
D: Deserializer<'de>,
{
let helper = UpgradeRegionsCompat::deserialize(deserializer)?;
Ok(match helper {
UpgradeRegionsCompat::Single(x) => UpgradeRegionsReply::new(vec![x]),
UpgradeRegionsCompat::Multiple(reply) => reply,
})
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(tag = "type", rename_all = "snake_case")] #[serde(tag = "type", rename_all = "snake_case")]
pub enum InstructionReply { pub enum InstructionReply {
@@ -680,11 +642,7 @@ pub enum InstructionReply {
OpenRegions(SimpleReply), OpenRegions(SimpleReply),
#[serde(alias = "close_region")] #[serde(alias = "close_region")]
CloseRegions(SimpleReply), CloseRegions(SimpleReply),
#[serde( UpgradeRegion(UpgradeRegionReply),
deserialize_with = "upgrade_regions_compat_from",
alias = "upgrade_region"
)]
UpgradeRegions(UpgradeRegionsReply),
#[serde( #[serde(
alias = "downgrade_region", alias = "downgrade_region",
deserialize_with = "downgrade_regions_compat_from" deserialize_with = "downgrade_regions_compat_from"
@@ -700,11 +658,9 @@ impl Display for InstructionReply {
match self { match self {
Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply), Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply),
Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply), Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply),
Self::UpgradeRegions(reply) => { Self::UpgradeRegion(reply) => write!(f, "InstructionReply::UpgradeRegion({})", reply),
write!(f, "InstructionReply::UpgradeRegions({:?})", reply.replies)
}
Self::DowngradeRegions(reply) => { Self::DowngradeRegions(reply) => {
write!(f, "InstructionReply::DowngradeRegions({:?})", reply.replies) write!(f, "InstructionReply::DowngradeRegions({:?})", reply)
} }
Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply), Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply),
Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply), Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply),
@@ -729,9 +685,9 @@ impl InstructionReply {
} }
} }
pub fn expect_upgrade_regions_reply(self) -> Vec<UpgradeRegionReply> { pub fn expect_upgrade_region_reply(self) -> UpgradeRegionReply {
match self { match self {
Self::UpgradeRegions(reply) => reply.replies, Self::UpgradeRegion(reply) => reply,
_ => panic!("Expected UpgradeRegion reply"), _ => panic!("Expected UpgradeRegion reply"),
} }
} }
@@ -793,58 +749,25 @@ mod tests {
serialized serialized
); );
let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion { let downgrade_region = InstructionReply::DowngradeRegions(DowngradeRegionsReply::single(
region_id: RegionId::new(1024, 1), DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
replay_timeout: Duration::from_millis(1000),
location_id: None,
replay_entry_id: None,
metadata_replay_entry_id: None,
}]);
let serialized = serde_json::to_string(&upgrade_region).unwrap();
assert_eq!(
r#"{"UpgradeRegions":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null}]}"#,
serialized
);
}
#[test]
fn test_serialize_instruction_reply() {
let downgrade_region_reply = InstructionReply::DowngradeRegions(
DowngradeRegionsReply::single(DowngradeRegionReply {
region_id: RegionId::new(1024, 1), region_id: RegionId::new(1024, 1),
last_entry_id: None, last_entry_id: None,
metadata_last_entry_id: None, metadata_last_entry_id: None,
exists: true, exists: true,
error: None, error: None,
}), },
); ));
let serialized = serde_json::to_string(&downgrade_region_reply).unwrap(); let serialized = serde_json::to_string(&downgrade_region).unwrap();
assert_eq!( assert_eq!(
r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#, r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#,
serialized serialized
); )
let upgrade_region_reply =
InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply {
region_id: RegionId::new(1024, 1),
ready: true,
exists: true,
error: None,
}));
let serialized = serde_json::to_string(&upgrade_region_reply).unwrap();
assert_eq!(
r#"{"type":"upgrade_regions","replies":[{"region_id":4398046511105,"ready":true,"exists":true,"error":null}]}"#,
serialized
);
} }
#[test] #[test]
fn test_deserialize_instruction() { fn test_deserialize_instruction() {
// legacy open region instruction
let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#; let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#;
let open_region_instruction: Instruction = let open_region_instruction: Instruction =
serde_json::from_str(open_region_instruction).unwrap(); serde_json::from_str(open_region_instruction).unwrap();
@@ -862,7 +785,6 @@ mod tests {
)]); )]);
assert_eq!(open_region_instruction, open_region); assert_eq!(open_region_instruction, open_region);
// legacy close region instruction
let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#; let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#;
let close_region_instruction: Instruction = let close_region_instruction: Instruction =
serde_json::from_str(close_region_instruction).unwrap(); serde_json::from_str(close_region_instruction).unwrap();
@@ -874,7 +796,6 @@ mod tests {
}]); }]);
assert_eq!(close_region_instruction, close_region); assert_eq!(close_region_instruction, close_region);
// legacy downgrade region instruction
let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#; let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#;
let downgrade_region_instruction: Instruction = let downgrade_region_instruction: Instruction =
serde_json::from_str(downgrade_region_instruction).unwrap(); serde_json::from_str(downgrade_region_instruction).unwrap();
@@ -884,25 +805,6 @@ mod tests {
}]); }]);
assert_eq!(downgrade_region_instruction, downgrade_region); assert_eq!(downgrade_region_instruction, downgrade_region);
// legacy upgrade region instruction
let upgrade_region_instruction = r#"{"UpgradeRegion":{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null,"replay_entry_id":null,"metadata_replay_entry_id":null}}"#;
let upgrade_region_instruction: Instruction =
serde_json::from_str(upgrade_region_instruction).unwrap();
let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion {
region_id: RegionId::new(1024, 1),
last_entry_id: None,
metadata_last_entry_id: None,
replay_timeout: Duration::from_millis(1000),
location_id: None,
replay_entry_id: None,
metadata_replay_entry_id: None,
}]);
assert_eq!(upgrade_region_instruction, upgrade_region);
}
#[test]
fn test_deserialize_instruction_reply() {
// legacy close region reply
let close_region_instruction_reply = let close_region_instruction_reply =
r#"{"result":true,"error":null,"type":"close_region"}"#; r#"{"result":true,"error":null,"type":"close_region"}"#;
let close_region_instruction_reply: InstructionReply = let close_region_instruction_reply: InstructionReply =
@@ -913,7 +815,6 @@ mod tests {
}); });
assert_eq!(close_region_instruction_reply, close_region_reply); assert_eq!(close_region_instruction_reply, close_region_reply);
// legacy open region reply
let open_region_instruction_reply = r#"{"result":true,"error":null,"type":"open_region"}"#; let open_region_instruction_reply = r#"{"result":true,"error":null,"type":"open_region"}"#;
let open_region_instruction_reply: InstructionReply = let open_region_instruction_reply: InstructionReply =
serde_json::from_str(open_region_instruction_reply).unwrap(); serde_json::from_str(open_region_instruction_reply).unwrap();
@@ -923,7 +824,6 @@ mod tests {
}); });
assert_eq!(open_region_instruction_reply, open_region_reply); assert_eq!(open_region_instruction_reply, open_region_reply);
// legacy downgrade region reply
let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#; let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#;
let downgrade_region_instruction_reply: InstructionReply = let downgrade_region_instruction_reply: InstructionReply =
serde_json::from_str(downgrade_region_instruction_reply).unwrap(); serde_json::from_str(downgrade_region_instruction_reply).unwrap();
@@ -937,19 +837,6 @@ mod tests {
}), }),
); );
assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply); assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply);
// legacy upgrade region reply
let upgrade_region_instruction_reply = r#"{"region_id":4398046511105,"ready":true,"exists":true,"error":null,"type":"upgrade_region"}"#;
let upgrade_region_instruction_reply: InstructionReply =
serde_json::from_str(upgrade_region_instruction_reply).unwrap();
let upgrade_region_reply =
InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply {
region_id: RegionId::new(1024, 1),
ready: true,
exists: true,
error: None,
}));
assert_eq!(upgrade_region_instruction_reply, upgrade_region_reply);
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -26,6 +26,7 @@ use datatypes::arrow::datatypes::{
Int32Type, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, Int32Type, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType, TimestampSecondType,
}; };
use datatypes::schema::SchemaRef;
fn prepare_record_batch(rows: usize) -> RecordBatch { fn prepare_record_batch(rows: usize) -> RecordBatch {
let schema = Schema::new(vec![ let schema = Schema::new(vec![
@@ -55,6 +56,14 @@ fn prepare_record_batch(rows: usize) -> RecordBatch {
RecordBatch::try_new(Arc::new(schema), columns).unwrap() RecordBatch::try_new(Arc::new(schema), columns).unwrap()
} }
fn iter_by_greptimedb_values(schema: SchemaRef, record_batch: RecordBatch) {
let record_batch =
common_recordbatch::RecordBatch::try_from_df_record_batch(schema, record_batch).unwrap();
for row in record_batch.rows() {
black_box(row);
}
}
fn iter_by_loop_rows_and_columns(record_batch: RecordBatch) { fn iter_by_loop_rows_and_columns(record_batch: RecordBatch) {
for i in 0..record_batch.num_rows() { for i in 0..record_batch.num_rows() {
for column in record_batch.columns() { for column in record_batch.columns() {
@@ -116,6 +125,19 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("iter_record_batch"); let mut group = c.benchmark_group("iter_record_batch");
for rows in [1usize, 10, 100, 1_000, 10_000] { for rows in [1usize, 10, 100, 1_000, 10_000] {
group.bench_with_input(
BenchmarkId::new("by_greptimedb_values", rows),
&rows,
|b, rows| {
let record_batch = prepare_record_batch(*rows);
let schema =
Arc::new(datatypes::schema::Schema::try_from(record_batch.schema()).unwrap());
b.iter(|| {
iter_by_greptimedb_values(schema.clone(), record_batch.clone());
})
},
);
group.bench_with_input( group.bench_with_input(
BenchmarkId::new("by_loop_rows_and_columns", rows), BenchmarkId::new("by_loop_rows_and_columns", rows),
&rows, &rows,

View File

@@ -193,13 +193,6 @@ pub enum Error {
#[snafu(implicit)] #[snafu(implicit)]
location: Location, location: Location,
}, },
#[snafu(display("Exceeded memory limit: {}", msg))]
ExceedMemoryLimit {
msg: String,
#[snafu(implicit)]
location: Location,
},
} }
impl ErrorExt for Error { impl ErrorExt for Error {
@@ -236,8 +229,6 @@ impl ErrorExt for Error {
Error::StreamTimeout { .. } => StatusCode::Cancelled, Error::StreamTimeout { .. } => StatusCode::Cancelled,
Error::StreamCancelled { .. } => StatusCode::Cancelled, Error::StreamCancelled { .. } => StatusCode::Cancelled,
Error::ExceedMemoryLimit { .. } => StatusCode::RuntimeResourcesExhausted,
} }
} }

View File

@@ -21,14 +21,11 @@ pub mod filter;
mod recordbatch; mod recordbatch;
pub mod util; pub mod util;
use std::fmt;
use std::pin::Pin; use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use adapter::RecordBatchMetrics; use adapter::RecordBatchMetrics;
use arc_swap::ArcSwapOption; use arc_swap::ArcSwapOption;
use common_base::readable_size::ReadableSize;
pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::arrow::compute::SortOptions; use datatypes::arrow::compute::SortOptions;
pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch; pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
@@ -409,399 +406,6 @@ impl<S: Stream<Item = Result<RecordBatch>> + Unpin> Stream for RecordBatchStream
} }
} }
/// Memory permit for a stream, providing privileged access or rate limiting.
///
/// The permit tracks whether this stream has privileged Top-K status.
/// When dropped, it automatically releases any privileged slot it holds.
pub struct MemoryPermit {
tracker: QueryMemoryTracker,
is_privileged: AtomicBool,
}
impl MemoryPermit {
/// Check if this permit currently has privileged status.
pub fn is_privileged(&self) -> bool {
self.is_privileged.load(Ordering::Acquire)
}
/// Ensure this permit has privileged status by acquiring a slot if available.
/// Returns true if privileged (either already privileged or just acquired privilege).
fn ensure_privileged(&self) -> bool {
if self.is_privileged.load(Ordering::Acquire) {
return true;
}
// Try to claim a privileged slot
self.tracker
.privileged_count
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| {
if count < self.tracker.privileged_slots {
Some(count + 1)
} else {
None
}
})
.map(|_| {
self.is_privileged.store(true, Ordering::Release);
true
})
.unwrap_or(false)
}
/// Track additional memory usage with this permit.
    /// Returns an error if the limit is exceeded.
///
/// # Arguments
/// * `additional` - Additional memory size to track in bytes
/// * `stream_tracked` - Total memory already tracked by this stream
///
/// # Behavior
/// - Privileged streams: Can push global memory usage up to full limit
/// - Standard-tier streams: Can push global memory usage up to limit * standard_tier_memory_fraction (default: 0.7)
/// - Standard-tier streams automatically attempt to acquire privilege if slots become available
    /// - The configured limit is an absolute hard limit - no stream can exceed it
pub fn track(&self, additional: usize, stream_tracked: usize) -> Result<()> {
// Ensure privileged status if possible
let is_privileged = self.ensure_privileged();
self.tracker
.track_internal(additional, is_privileged, stream_tracked)
}
/// Release tracked memory.
///
/// # Arguments
/// * `amount` - Amount of memory to release in bytes
pub fn release(&self, amount: usize) {
self.tracker.release(amount);
}
}
impl Drop for MemoryPermit {
fn drop(&mut self) {
// Release privileged slot if we had one
if self.is_privileged.load(Ordering::Acquire) {
self.tracker
.privileged_count
.fetch_sub(1, Ordering::Release);
}
}
}
/// Memory tracker for RecordBatch streams. Clone to share the same limit across queries.
///
/// Implements a two-tier memory allocation strategy:
/// - **Privileged tier**: First N streams (default: 20) can use up to the full memory limit
/// - **Standard tier**: Remaining streams are restricted to a fraction of the limit (default: 70%)
/// - Privilege is granted on a first-come-first-served basis
/// - The configured limit is an absolute hard cap - no stream can exceed it
#[derive(Clone)]
pub struct QueryMemoryTracker {
current: Arc<AtomicUsize>,
limit: usize,
standard_tier_memory_fraction: f64,
privileged_count: Arc<AtomicUsize>,
privileged_slots: usize,
on_update: Option<Arc<dyn Fn(usize) + Send + Sync>>,
on_reject: Option<Arc<dyn Fn() + Send + Sync>>,
}
impl fmt::Debug for QueryMemoryTracker {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("QueryMemoryTracker")
.field("current", &self.current.load(Ordering::Acquire))
.field("limit", &self.limit)
.field(
"standard_tier_memory_fraction",
&self.standard_tier_memory_fraction,
)
.field(
"privileged_count",
&self.privileged_count.load(Ordering::Acquire),
)
.field("privileged_slots", &self.privileged_slots)
.field("on_update", &self.on_update.is_some())
.field("on_reject", &self.on_reject.is_some())
.finish()
}
}
impl QueryMemoryTracker {
// Default privileged slots when max_concurrent_queries is 0.
const DEFAULT_PRIVILEGED_SLOTS: usize = 20;
    // Ratio for the privileged tier: 70% of queries get privileged access, and the standard tier is capped at 70% of memory.
const DEFAULT_PRIVILEGED_TIER_RATIO: f64 = 0.7;
/// Create a new memory tracker with the given limit and max_concurrent_queries.
/// Calculates privileged slots as 70% of max_concurrent_queries (or 20 if max_concurrent_queries is 0).
///
/// # Arguments
/// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited.
/// * `max_concurrent_queries` - Maximum number of concurrent queries (0 = unlimited).
pub fn new(limit: usize, max_concurrent_queries: usize) -> Self {
let privileged_slots = Self::calculate_privileged_slots(max_concurrent_queries);
Self::with_privileged_slots(limit, privileged_slots)
}
/// Create a new memory tracker with custom privileged slots limit.
pub fn with_privileged_slots(limit: usize, privileged_slots: usize) -> Self {
Self::with_config(limit, privileged_slots, Self::DEFAULT_PRIVILEGED_TIER_RATIO)
}
/// Create a new memory tracker with full configuration.
///
/// # Arguments
/// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited.
/// * `privileged_slots` - Maximum number of streams that can get privileged status.
/// * `standard_tier_memory_fraction` - Memory fraction for standard-tier streams (range: [0.0, 1.0]).
///
/// # Panics
/// Panics if `standard_tier_memory_fraction` is not in the range [0.0, 1.0].
pub fn with_config(
limit: usize,
privileged_slots: usize,
standard_tier_memory_fraction: f64,
) -> Self {
assert!(
(0.0..=1.0).contains(&standard_tier_memory_fraction),
"standard_tier_memory_fraction must be in [0.0, 1.0], got {}",
standard_tier_memory_fraction
);
Self {
current: Arc::new(AtomicUsize::new(0)),
limit,
standard_tier_memory_fraction,
privileged_count: Arc::new(AtomicUsize::new(0)),
privileged_slots,
on_update: None,
on_reject: None,
}
}
/// Register a new permit for memory tracking.
/// The first `privileged_slots` permits get privileged status automatically.
/// The returned permit can be shared across multiple streams of the same query.
pub fn register_permit(&self) -> MemoryPermit {
// Try to claim a privileged slot
let is_privileged = self
.privileged_count
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| {
if count < self.privileged_slots {
Some(count + 1)
} else {
None
}
})
.is_ok();
MemoryPermit {
tracker: self.clone(),
is_privileged: AtomicBool::new(is_privileged),
}
}
/// Set a callback to be called whenever the usage changes successfully.
/// The callback receives the new total usage in bytes.
///
/// # Note
/// The callback is called after both successful `track()` and `release()` operations.
/// It is called even when `limit == 0` (unlimited mode) to track actual usage.
pub fn with_on_update<F>(mut self, on_update: F) -> Self
where
F: Fn(usize) + Send + Sync + 'static,
{
self.on_update = Some(Arc::new(on_update));
self
}
/// Set a callback to be called when memory allocation is rejected.
///
/// # Note
/// This is only called when `track()` fails due to exceeding the limit.
/// It is never called when `limit == 0` (unlimited mode).
pub fn with_on_reject<F>(mut self, on_reject: F) -> Self
where
F: Fn() + Send + Sync + 'static,
{
self.on_reject = Some(Arc::new(on_reject));
self
}
/// Get the current memory usage in bytes.
pub fn current(&self) -> usize {
self.current.load(Ordering::Acquire)
}
fn calculate_privileged_slots(max_concurrent_queries: usize) -> usize {
if max_concurrent_queries == 0 {
Self::DEFAULT_PRIVILEGED_SLOTS
} else {
((max_concurrent_queries as f64 * Self::DEFAULT_PRIVILEGED_TIER_RATIO) as usize).max(1)
}
}
/// Internal method to track additional memory usage.
///
/// Called by `MemoryPermit::track()`. Use `MemoryPermit::track()` instead of calling this directly.
fn track_internal(
&self,
additional: usize,
is_privileged: bool,
stream_tracked: usize,
) -> Result<()> {
// Calculate effective global limit based on stream privilege
// Privileged streams: can push global usage up to full limit
// Standard-tier streams: can only push global usage up to fraction of limit
let effective_limit = if is_privileged {
self.limit
} else {
(self.limit as f64 * self.standard_tier_memory_fraction) as usize
};
let mut new_total = 0;
let result = self
.current
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
new_total = current.saturating_add(additional);
if self.limit == 0 {
// Unlimited mode
return Some(new_total);
}
// Check if new global total exceeds effective limit
                // The configured limit is an absolute hard limit - no stream can exceed it
if new_total <= effective_limit {
Some(new_total)
} else {
None
}
});
match result {
Ok(_) => {
if let Some(callback) = &self.on_update {
callback(new_total);
}
Ok(())
}
Err(current) => {
if let Some(callback) = &self.on_reject {
callback();
}
let msg = format!(
"{} requested, {} used globally ({}%), {} used by this stream (privileged: {}), effective limit: {} ({}%), hard limit: {}",
ReadableSize(additional as u64),
ReadableSize(current as u64),
if self.limit > 0 {
current * 100 / self.limit
} else {
0
},
ReadableSize(stream_tracked as u64),
is_privileged,
ReadableSize(effective_limit as u64),
if self.limit > 0 {
effective_limit * 100 / self.limit
} else {
0
},
ReadableSize(self.limit as u64)
);
error::ExceedMemoryLimitSnafu { msg }.fail()
}
}
}
/// Release tracked memory.
///
/// # Arguments
/// * `amount` - Amount of memory to release in bytes
pub fn release(&self, amount: usize) {
if let Ok(old_value) =
self.current
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
Some(current.saturating_sub(amount))
})
&& let Some(callback) = &self.on_update
{
callback(old_value.saturating_sub(amount));
}
}
}
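As a quick numeric check of the two-tier cap computed in `track_internal` above, a standalone sketch (not the crate's API) of the effective-limit arithmetic with the default 0.7 standard-tier fraction:

// Standalone sketch mirroring the effective-limit arithmetic above.
fn effective_limit(limit: usize, is_privileged: bool, standard_tier_fraction: f64) -> usize {
    if is_privileged {
        limit
    } else {
        (limit as f64 * standard_tier_fraction) as usize
    }
}

fn main() {
    // Privileged stream: may push global usage up to the hard limit.
    assert_eq!(effective_limit(1000, true, 0.7), 1000);
    // Standard-tier stream: capped at limit * fraction.
    assert_eq!(effective_limit(1000, false, 0.7), 700);
    // limit == 0 is treated as "unlimited" by the tracker before this math applies.
    assert_eq!(effective_limit(0, false, 0.7), 0);
}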
/// A wrapper stream that tracks memory usage of RecordBatches.
pub struct MemoryTrackedStream {
inner: SendableRecordBatchStream,
permit: Arc<MemoryPermit>,
// Total tracked size, released when stream drops.
total_tracked: usize,
}
impl MemoryTrackedStream {
pub fn new(inner: SendableRecordBatchStream, permit: Arc<MemoryPermit>) -> Self {
Self {
inner,
permit,
total_tracked: 0,
}
}
}
impl Stream for MemoryTrackedStream {
type Item = Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
match Pin::new(&mut self.inner).poll_next(cx) {
Poll::Ready(Some(Ok(batch))) => {
let additional = batch
.columns()
.iter()
.map(|c| c.memory_size())
.sum::<usize>();
if let Err(e) = self.permit.track(additional, self.total_tracked) {
return Poll::Ready(Some(Err(e)));
}
self.total_tracked += additional;
Poll::Ready(Some(Ok(batch)))
}
Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))),
Poll::Ready(None) => Poll::Ready(None),
Poll::Pending => Poll::Pending,
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl Drop for MemoryTrackedStream {
fn drop(&mut self) {
if self.total_tracked > 0 {
self.permit.release(self.total_tracked);
}
}
}
impl RecordBatchStream for MemoryTrackedStream {
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
self.inner.output_ordering()
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
self.inner.metrics()
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::sync::Arc; use std::sync::Arc;
@@ -892,157 +496,4 @@ mod tests {
assert_eq!(collected[0], batch1); assert_eq!(collected[0], batch1);
assert_eq!(collected[1], batch2); assert_eq!(collected[1], batch2);
} }
#[test]
fn test_query_memory_tracker_basic() {
let tracker = Arc::new(QueryMemoryTracker::new(1000, 0));
// Register first stream - should get privileged status
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Privileged stream can use up to limit
assert!(permit1.track(500, 0).is_ok());
assert_eq!(tracker.current(), 500);
// Register second stream - also privileged
let permit2 = tracker.register_permit();
assert!(permit2.is_privileged());
// Can add more but cannot exceed hard limit (1000)
assert!(permit2.track(400, 0).is_ok());
assert_eq!(tracker.current(), 900);
permit1.release(500);
permit2.release(400);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_privileged_limit() {
// Privileged slots = 2 for easy testing
// Limit: 1000, standard-tier fraction: 0.7 (default)
// Privileged can push global to 1000, standard-tier can push global to 700
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 2));
// First 2 streams are privileged
let permit1 = tracker.register_permit();
let permit2 = tracker.register_permit();
assert!(permit1.is_privileged());
assert!(permit2.is_privileged());
// Third stream is standard-tier (not privileged)
let permit3 = tracker.register_permit();
assert!(!permit3.is_privileged());
// Privileged stream uses some memory
assert!(permit1.track(300, 0).is_ok());
assert_eq!(tracker.current(), 300);
// Standard-tier can add up to 400 (total becomes 700, its effective limit)
assert!(permit3.track(400, 0).is_ok());
assert_eq!(tracker.current(), 700);
// Standard-tier stream cannot push global beyond 700
let err = permit3.track(100, 400).unwrap_err();
let err_msg = err.to_string();
assert!(err_msg.contains("400B used by this stream"));
assert!(err_msg.contains("effective limit: 700B (70%)"));
assert!(err_msg.contains("700B used globally (70%)"));
assert_eq!(tracker.current(), 700);
permit1.release(300);
permit3.release(400);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_promotion() {
// Privileged slots = 1 for easy testing
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1));
// First stream is privileged
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Second stream is standard-tier (can only use 500)
let permit2 = tracker.register_permit();
assert!(!permit2.is_privileged());
// Standard-tier can only track 500
assert!(permit2.track(400, 0).is_ok());
assert_eq!(tracker.current(), 400);
// Drop first permit to release privileged slot
drop(permit1);
// Second stream can now be promoted and use more memory
assert!(permit2.track(500, 400).is_ok());
assert!(permit2.is_privileged());
assert_eq!(tracker.current(), 900);
permit2.release(900);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_privileged_hard_limit() {
        // Test that the configured limit is the absolute hard limit for all streams
// Privileged: can use full limit (1000)
// Standard-tier: can use 0.7x limit (700 with defaults)
let tracker = Arc::new(QueryMemoryTracker::new(1000, 0));
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Privileged can use up to full limit (1000)
assert!(permit1.track(900, 0).is_ok());
assert_eq!(tracker.current(), 900);
// Privileged cannot exceed hard limit (1000)
assert!(permit1.track(200, 900).is_err());
assert_eq!(tracker.current(), 900);
// Can add within hard limit
assert!(permit1.track(100, 900).is_ok());
assert_eq!(tracker.current(), 1000);
// Cannot exceed even by 1 byte
assert!(permit1.track(1, 1000).is_err());
assert_eq!(tracker.current(), 1000);
permit1.release(1000);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_standard_tier_fraction() {
// Test standard-tier streams use fraction of limit
// Limit: 1000, default fraction: 0.7, so standard-tier can use 700
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1));
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
let permit2 = tracker.register_permit();
assert!(!permit2.is_privileged());
// Standard-tier can use up to 700 (1000 * 0.7 default)
assert!(permit2.track(600, 0).is_ok());
assert_eq!(tracker.current(), 600);
// Cannot exceed standard-tier limit (700)
assert!(permit2.track(200, 600).is_err());
assert_eq!(tracker.current(), 600);
// Can add within standard-tier limit
assert!(permit2.track(100, 600).is_ok());
assert_eq!(tracker.current(), 700);
// Cannot exceed standard-tier limit
assert!(permit2.track(1, 700).is_err());
assert_eq!(tracker.current(), 700);
permit2.release(700);
assert_eq!(tracker.current(), 0);
}
} }

View File

@@ -23,6 +23,7 @@ use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef a
use datatypes::arrow::array::RecordBatchOptions; use datatypes::arrow::array::RecordBatchOptions;
use datatypes::prelude::DataType; use datatypes::prelude::DataType;
use datatypes::schema::SchemaRef; use datatypes::schema::SchemaRef;
use datatypes::value::Value;
use datatypes::vectors::{Helper, VectorRef}; use datatypes::vectors::{Helper, VectorRef};
use serde::ser::{Error, SerializeStruct}; use serde::ser::{Error, SerializeStruct};
use serde::{Serialize, Serializer}; use serde::{Serialize, Serializer};
@@ -193,6 +194,11 @@ impl RecordBatch {
self.df_record_batch.num_rows() self.df_record_batch.num_rows()
} }
    /// Creates an iterator that traverses the data row by row.
pub fn rows(&self) -> RecordBatchRowIterator<'_> {
RecordBatchRowIterator::new(self)
}
pub fn column_vectors( pub fn column_vectors(
&self, &self,
table_name: &str, table_name: &str,
@@ -271,6 +277,44 @@ impl Serialize for RecordBatch {
} }
} }
pub struct RecordBatchRowIterator<'a> {
record_batch: &'a RecordBatch,
rows: usize,
columns: usize,
row_cursor: usize,
}
impl<'a> RecordBatchRowIterator<'a> {
fn new(record_batch: &'a RecordBatch) -> RecordBatchRowIterator<'a> {
RecordBatchRowIterator {
record_batch,
rows: record_batch.df_record_batch.num_rows(),
columns: record_batch.df_record_batch.num_columns(),
row_cursor: 0,
}
}
}
impl Iterator for RecordBatchRowIterator<'_> {
type Item = Vec<Value>;
fn next(&mut self) -> Option<Self::Item> {
if self.row_cursor == self.rows {
None
} else {
let mut row = Vec::with_capacity(self.columns);
for col in 0..self.columns {
let column = self.record_batch.column(col);
row.push(column.get(self.row_cursor));
}
self.row_cursor += 1;
Some(row)
}
}
}
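A short usage sketch of the `rows()` iterator added above, mirroring the test further down; it assumes the `common_recordbatch::RecordBatch` re-export and the usual `datatypes` schema and vector types:

use std::sync::Arc;

use common_recordbatch::RecordBatch;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::value::Value;
use datatypes::vectors::{StringVector, UInt32Vector, VectorRef};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false),
        ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true),
    ]));
    let columns: Vec<VectorRef> = vec![
        Arc::new(UInt32Vector::from_slice(vec![1, 2])),
        Arc::new(StringVector::from(vec![None, Some("hello")])),
    ];
    let batch = RecordBatch::new(schema, columns).unwrap();

    // Each item is one row materialized as a Vec<Value>.
    for row in batch.rows() {
        println!("{row:?}");
    }
    assert_eq!(batch.rows().next().unwrap(), vec![Value::UInt32(1), Value::Null]);
}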
/// Merges multiple record batches into a single one. /// Merges multiple record batches into a single one.
pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Result<RecordBatch> { pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Result<RecordBatch> {
let batches_len = batches.len(); let batches_len = batches.len();
@@ -305,9 +349,7 @@ pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Resul
mod tests { mod tests {
use std::sync::Arc; use std::sync::Arc;
use datatypes::arrow::array::{AsArray, UInt32Array}; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, UInt32Type};
use datatypes::arrow_array::StringArray;
use datatypes::data_type::ConcreteDataType; use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema}; use datatypes::schema::{ColumnSchema, Schema};
use datatypes::vectors::{StringVector, UInt32Vector}; use datatypes::vectors::{StringVector, UInt32Vector};
@@ -365,6 +407,64 @@ mod tests {
); );
} }
#[test]
fn test_record_batch_visitor() {
let column_schemas = vec![
ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false),
ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true),
];
let schema = Arc::new(Schema::new(column_schemas));
let columns: Vec<VectorRef> = vec![
Arc::new(UInt32Vector::from_slice(vec![1, 2, 3, 4])),
Arc::new(StringVector::from(vec![
None,
Some("hello"),
Some("greptime"),
None,
])),
];
let recordbatch = RecordBatch::new(schema, columns).unwrap();
let mut record_batch_iter = recordbatch.rows();
assert_eq!(
vec![Value::UInt32(1), Value::Null],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(2), Value::String("hello".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(3), Value::String("greptime".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(4), Value::Null],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert!(record_batch_iter.next().is_none());
}
#[test] #[test]
fn test_record_batch_slice() { fn test_record_batch_slice() {
let column_schemas = vec![ let column_schemas = vec![
@@ -383,16 +483,26 @@ mod tests {
]; ];
let recordbatch = RecordBatch::new(schema, columns).unwrap(); let recordbatch = RecordBatch::new(schema, columns).unwrap();
let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice"); let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice");
let mut record_batch_iter = recordbatch.rows();
assert_eq!(
vec![Value::UInt32(2), Value::String("hello".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
let expected = &UInt32Array::from_iter_values([2u32, 3]); assert_eq!(
let array = recordbatch.column(0).to_arrow_array(); vec![Value::UInt32(3), Value::String("greptime".into())],
let actual = array.as_primitive::<UInt32Type>(); record_batch_iter
assert_eq!(expected, actual); .next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
let expected = &StringArray::from(vec!["hello", "greptime"]); assert!(record_batch_iter.next().is_none());
let array = recordbatch.column(1).to_arrow_array();
let actual = array.as_string::<i32>();
assert_eq!(expected, actual);
assert!(recordbatch.slice(1, 5).is_err()); assert!(recordbatch.slice(1, 5).is_err());
} }

View File

@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
use std::fmt::Display; use std::fmt::Display;
use std::str::FromStr;
use chrono::{FixedOffset, TimeZone}; use chrono::{FixedOffset, TimeZone};
use chrono_tz::{OffsetComponents, Tz}; use chrono_tz::{OffsetComponents, Tz};
@@ -101,7 +102,7 @@ impl Timezone {
.parse::<u32>() .parse::<u32>()
.context(ParseOffsetStrSnafu { raw: tz_string })?; .context(ParseOffsetStrSnafu { raw: tz_string })?;
Self::hours_mins_opt(hrs, mins) Self::hours_mins_opt(hrs, mins)
} else if let Ok(tz) = Tz::from_str_insensitive(tz_string) { } else if let Ok(tz) = Tz::from_str(tz_string) {
Ok(Self::Named(tz)) Ok(Self::Named(tz))
} else { } else {
ParseTimezoneNameSnafu { raw: tz_string }.fail() ParseTimezoneNameSnafu { raw: tz_string }.fail()
@@ -202,10 +203,6 @@ mod tests {
Timezone::Named(Tz::Asia__Shanghai), Timezone::Named(Tz::Asia__Shanghai),
Timezone::from_tz_string("Asia/Shanghai").unwrap() Timezone::from_tz_string("Asia/Shanghai").unwrap()
); );
assert_eq!(
Timezone::Named(Tz::Asia__Shanghai),
Timezone::from_tz_string("Asia/ShangHai").unwrap()
);
assert_eq!( assert_eq!(
Timezone::Named(Tz::UTC), Timezone::Named(Tz::UTC),
Timezone::from_tz_string("UTC").unwrap() Timezone::from_tz_string("UTC").unwrap()

View File

@@ -522,7 +522,6 @@ impl DatanodeBuilder {
file_ref_manager, file_ref_manager,
partition_expr_fetcher.clone(), partition_expr_fetcher.clone(),
plugins, plugins,
opts.max_concurrent_queries,
); );
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]
@@ -565,7 +564,6 @@ impl DatanodeBuilder {
file_ref_manager, file_ref_manager,
partition_expr_fetcher, partition_expr_fetcher,
plugins, plugins,
opts.max_concurrent_queries,
); );
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]
@@ -587,7 +585,6 @@ impl DatanodeBuilder {
file_ref_manager, file_ref_manager,
partition_expr_fetcher.clone(), partition_expr_fetcher.clone(),
plugins, plugins,
opts.max_concurrent_queries,
); );
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]

View File

@@ -44,6 +44,7 @@ use crate::region_server::RegionServer;
#[derive(Clone)] #[derive(Clone)]
pub struct RegionHeartbeatResponseHandler { pub struct RegionHeartbeatResponseHandler {
region_server: RegionServer, region_server: RegionServer,
catchup_tasks: TaskTracker<()>,
downgrade_tasks: TaskTracker<()>, downgrade_tasks: TaskTracker<()>,
flush_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>,
open_region_parallelism: usize, open_region_parallelism: usize,
@@ -63,6 +64,7 @@ pub trait InstructionHandler: Send + Sync {
#[derive(Clone)] #[derive(Clone)]
pub struct HandlerContext { pub struct HandlerContext {
region_server: RegionServer, region_server: RegionServer,
catchup_tasks: TaskTracker<()>,
downgrade_tasks: TaskTracker<()>, downgrade_tasks: TaskTracker<()>,
flush_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>,
gc_tasks: TaskTracker<GcReport>, gc_tasks: TaskTracker<GcReport>,
@@ -73,6 +75,7 @@ impl HandlerContext {
pub fn new_for_test(region_server: RegionServer) -> Self { pub fn new_for_test(region_server: RegionServer) -> Self {
Self { Self {
region_server, region_server,
catchup_tasks: TaskTracker::new(),
downgrade_tasks: TaskTracker::new(), downgrade_tasks: TaskTracker::new(),
flush_tasks: TaskTracker::new(), flush_tasks: TaskTracker::new(),
gc_tasks: TaskTracker::new(), gc_tasks: TaskTracker::new(),
@@ -85,6 +88,7 @@ impl RegionHeartbeatResponseHandler {
pub fn new(region_server: RegionServer) -> Self { pub fn new(region_server: RegionServer) -> Self {
Self { Self {
region_server, region_server,
catchup_tasks: TaskTracker::new(),
downgrade_tasks: TaskTracker::new(), downgrade_tasks: TaskTracker::new(),
flush_tasks: TaskTracker::new(), flush_tasks: TaskTracker::new(),
// Default to half of the number of CPUs. // Default to half of the number of CPUs.
@@ -110,12 +114,7 @@ impl RegionHeartbeatResponseHandler {
)), )),
Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())), Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())),
Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())), Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())),
Instruction::UpgradeRegions(_) => Ok(Box::new( Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler.into())),
UpgradeRegionsHandler {
upgrade_region_parallelism: self.open_region_parallelism,
}
.into(),
)),
Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())), Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())),
Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())), Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())),
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(), Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
@@ -195,7 +194,7 @@ dispatch_instr!(
OpenRegions => OpenRegions, OpenRegions => OpenRegions,
FlushRegions => FlushRegions, FlushRegions => FlushRegions,
DowngradeRegions => DowngradeRegions, DowngradeRegions => DowngradeRegions,
UpgradeRegions => UpgradeRegions, UpgradeRegion => UpgradeRegions,
GetFileRefs => GetFileRefs, GetFileRefs => GetFileRefs,
GcRegions => GcRegions, GcRegions => GcRegions,
); );
@@ -217,6 +216,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
let mailbox = ctx.mailbox.clone(); let mailbox = ctx.mailbox.clone();
let region_server = self.region_server.clone(); let region_server = self.region_server.clone();
let catchup_tasks = self.catchup_tasks.clone();
let downgrade_tasks = self.downgrade_tasks.clone(); let downgrade_tasks = self.downgrade_tasks.clone();
let flush_tasks = self.flush_tasks.clone(); let flush_tasks = self.flush_tasks.clone();
let gc_tasks = self.gc_tasks.clone(); let gc_tasks = self.gc_tasks.clone();
@@ -226,6 +226,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
.handle( .handle(
&HandlerContext { &HandlerContext {
region_server, region_server,
catchup_tasks,
downgrade_tasks, downgrade_tasks,
flush_tasks, flush_tasks,
gc_tasks, gc_tasks,
@@ -333,10 +334,10 @@ mod tests {
); );
// Upgrade region // Upgrade region
let instruction = Instruction::UpgradeRegions(vec![UpgradeRegion { let instruction = Instruction::UpgradeRegion(UpgradeRegion {
region_id, region_id,
..Default::default() ..Default::default()
}]); });
assert!( assert!(
heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction))) heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction)))
); );

View File

@@ -12,209 +12,125 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use common_error::ext::{BoxedError, ErrorExt}; use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply};
use common_error::status_code::StatusCode; use common_telemetry::{info, warn};
use common_meta::instruction::{ use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint};
InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply,
};
use common_telemetry::{debug, info, warn};
use store_api::region_request::{RegionCatchupRequest, ReplayCheckpoint};
use store_api::storage::RegionId;
use crate::error::Result;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
use crate::heartbeat::task_tracker::WaitResult;
#[derive(Debug, Clone, Copy, Default)] #[derive(Debug, Clone, Copy, Default)]
pub struct UpgradeRegionsHandler { pub struct UpgradeRegionsHandler;
pub upgrade_region_parallelism: usize,
}
#[cfg(test)]
impl UpgradeRegionsHandler {
fn new_test() -> UpgradeRegionsHandler {
UpgradeRegionsHandler {
upgrade_region_parallelism: 8,
}
}
}
impl UpgradeRegionsHandler {
fn convert_responses_to_replies(
responses: Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>>,
catchup_regions: &[RegionId],
) -> Vec<UpgradeRegionReply> {
match responses {
Ok(responses) => responses
.into_iter()
.map(|(region_id, result)| match result {
Ok(()) => UpgradeRegionReply {
region_id,
ready: true,
exists: true,
error: None,
},
Err(err) => {
if err.status_code() == StatusCode::RegionNotFound {
UpgradeRegionReply {
region_id,
ready: false,
exists: false,
error: Some(format!("{err:?}")),
}
} else {
UpgradeRegionReply {
region_id,
ready: false,
exists: true,
error: Some(format!("{err:?}")),
}
}
}
})
.collect::<Vec<_>>(),
Err(err) => catchup_regions
.iter()
.map(|region_id| UpgradeRegionReply {
region_id: *region_id,
ready: false,
exists: true,
error: Some(format!("{err:?}")),
})
.collect::<Vec<_>>(),
}
}
}
impl UpgradeRegionsHandler {
    // Handles the upgrade regions instruction.
    //
    // Returns a batch of upgrade region replies; the order of the replies is not guaranteed.
async fn handle_upgrade_regions(
&self,
ctx: &HandlerContext,
upgrade_regions: Vec<UpgradeRegion>,
) -> Vec<UpgradeRegionReply> {
let num_upgrade_regions = upgrade_regions.len();
let mut replies = Vec::with_capacity(num_upgrade_regions);
let mut catchup_requests = Vec::with_capacity(num_upgrade_regions);
let mut catchup_regions = Vec::with_capacity(num_upgrade_regions);
let mut timeout = None;
for upgrade_region in upgrade_regions {
let Some(writable) = ctx.region_server.is_region_leader(upgrade_region.region_id)
else {
// Region is not found.
debug!("Region {} is not found", upgrade_region.region_id);
replies.push(UpgradeRegionReply {
region_id: upgrade_region.region_id,
ready: false,
exists: false,
error: None,
});
continue;
};
// Ignores the catchup requests for writable regions.
if writable {
warn!(
"Region {} is writable, ignores the catchup request",
upgrade_region.region_id
);
replies.push(UpgradeRegionReply {
region_id: upgrade_region.region_id,
ready: true,
exists: true,
error: None,
});
} else {
let UpgradeRegion {
last_entry_id,
metadata_last_entry_id,
location_id,
replay_entry_id,
metadata_replay_entry_id,
replay_timeout,
..
} = upgrade_region;
match timeout {
Some(timeout) => {
debug_assert_eq!(timeout, replay_timeout);
}
None => {
                        // TODO(weny): make the replay_timeout required.
timeout = Some(replay_timeout);
}
}
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
catchup_regions.push(upgrade_region.region_id);
catchup_requests.push((
upgrade_region.region_id,
RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
},
));
}
}
let Some(timeout) = timeout else {
            // No replay timeout was recorded, meaning every region was writable and there is nothing to catch up.
info!("All regions are writable, no need to catchup");
debug_assert_eq!(replies.len(), num_upgrade_regions);
return replies;
};
match tokio::time::timeout(
timeout,
ctx.region_server
.handle_batch_catchup_requests(self.upgrade_region_parallelism, catchup_requests),
)
.await
{
Ok(responses) => {
replies.extend(
Self::convert_responses_to_replies(responses, &catchup_regions).into_iter(),
);
}
Err(_) => {
replies.extend(catchup_regions.iter().map(|region_id| UpgradeRegionReply {
region_id: *region_id,
ready: false,
exists: true,
error: None,
}));
}
}
replies
}
}
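The `Err(_)` arm in the match above relies on `tokio::time::timeout` returning the outer `Err(Elapsed)` only when the deadline passes, in which case the pending catchup futures are dropped and the regions are reported as not ready yet. A minimal standalone illustration (assumes tokio with the rt, macros, and time features):

use std::time::Duration;

#[tokio::main]
async fn main() {
    // Completes before the deadline: the outer Result is Ok with the future's output.
    let in_time = tokio::time::timeout(Duration::from_millis(50), async { 42 }).await;
    assert_eq!(in_time.unwrap(), 42);

    // Misses the deadline: the outer Result is Err(Elapsed) and the inner future is dropped.
    let too_slow = tokio::time::timeout(
        Duration::from_millis(10),
        tokio::time::sleep(Duration::from_millis(100)),
    )
    .await;
    assert!(too_slow.is_err());
}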
#[async_trait::async_trait] #[async_trait::async_trait]
impl InstructionHandler for UpgradeRegionsHandler { impl InstructionHandler for UpgradeRegionsHandler {
type Instruction = Vec<UpgradeRegion>; type Instruction = UpgradeRegion;
async fn handle( async fn handle(
&self, &self,
ctx: &HandlerContext, ctx: &HandlerContext,
upgrade_regions: Self::Instruction, UpgradeRegion {
region_id,
last_entry_id,
metadata_last_entry_id,
replay_timeout,
location_id,
replay_entry_id,
metadata_replay_entry_id,
}: UpgradeRegion,
) -> Option<InstructionReply> { ) -> Option<InstructionReply> {
let replies = self.handle_upgrade_regions(ctx, upgrade_regions).await; let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: false,
error: None,
}));
};
Some(InstructionReply::UpgradeRegions(UpgradeRegionsReply::new( if writable {
replies, return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
))) ready: true,
exists: true,
error: None,
}));
}
let region_server_moved = ctx.region_server.clone();
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
        // The catchup task is almost zero cost if the underlying region is already writable.
// Therefore, it always registers a new catchup task.
let register_result = ctx
.catchup_tasks
.try_register(
region_id,
Box::pin(async move {
info!(
"Executing region: {region_id} catchup to: last entry id {last_entry_id:?}"
);
region_server_moved
.handle_request(
region_id,
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
}),
)
.await?;
Ok(())
}),
)
.await;
if register_result.is_busy() {
warn!("Another catchup task is running for the region: {region_id}");
}
        // Returns immediately when no replay timeout is given.
let Some(replay_timeout) = replay_timeout else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
}));
};
// We don't care that it returns a newly registered or running task.
let mut watcher = register_result.into_watcher();
let result = ctx.catchup_tasks.wait(&mut watcher, replay_timeout).await;
match result {
WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
})),
WaitResult::Finish(Ok(_)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: true,
exists: true,
error: None,
}))
}
WaitResult::Finish(Err(err)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: Some(format!("{err:?}")),
}))
}
}
} }
} }
@@ -226,6 +142,7 @@ mod tests {
use mito2::engine::MITO_ENGINE_NAME; use mito2::engine::MITO_ENGINE_NAME;
use store_api::region_engine::RegionRole; use store_api::region_engine::RegionRole;
use store_api::storage::RegionId; use store_api::storage::RegionId;
use tokio::time::Instant;
use crate::error; use crate::error;
use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler; use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler;
@@ -241,30 +158,21 @@ mod tests {
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
let region_id2 = RegionId::new(1024, 2); let waits = vec![None, Some(Duration::from_millis(100u64))];
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test() for replay_timeout in waits {
.handle( let reply = UpgradeRegionsHandler
&handler_context, .handle(
vec![ &handler_context,
UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout, replay_timeout,
..Default::default() ..Default::default()
}, },
UpgradeRegion { )
region_id: region_id2, .await;
replay_timeout,
..Default::default()
},
],
)
.await;
let replies = &reply.unwrap().expect_upgrade_regions_reply(); let reply = reply.unwrap().expect_upgrade_region_reply();
assert_eq!(replies[0].region_id, region_id);
assert_eq!(replies[1].region_id, region_id2);
for reply in replies {
assert!(!reply.exists); assert!(!reply.exists);
assert!(reply.error.is_none()); assert!(reply.error.is_none());
} }
@@ -274,7 +182,6 @@ mod tests {
async fn test_region_writable() { async fn test_region_writable() {
let mock_region_server = mock_region_server(); let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
let region_id2 = RegionId::new(1024, 2);
let (mock_engine, _) = let (mock_engine, _) =
MockRegionEngine::with_custom_apply_fn(MITO_ENGINE_NAME, |region_engine| { MockRegionEngine::with_custom_apply_fn(MITO_ENGINE_NAME, |region_engine| {
@@ -284,32 +191,25 @@ mod tests {
unreachable!(); unreachable!();
})); }));
}); });
mock_region_server.register_test_region(region_id, mock_engine.clone()); mock_region_server.register_test_region(region_id, mock_engine);
mock_region_server.register_test_region(region_id2, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test() let waits = vec![None, Some(Duration::from_millis(100u64))];
.handle(
&handler_context, for replay_timeout in waits {
vec![ let reply = UpgradeRegionsHandler
.handle(
&handler_context,
UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout, replay_timeout,
..Default::default() ..Default::default()
}, },
UpgradeRegion { )
region_id: region_id2, .await;
replay_timeout,
..Default::default()
},
],
)
.await;
let replies = &reply.unwrap().expect_upgrade_regions_reply(); let reply = reply.unwrap().expect_upgrade_region_reply();
assert_eq!(replies[0].region_id, region_id);
assert_eq!(replies[1].region_id, region_id2);
for reply in replies {
assert!(reply.ready); assert!(reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none()); assert!(reply.error.is_none());
@@ -332,27 +232,30 @@ mod tests {
mock_region_server.register_test_region(region_id, mock_engine); mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
vec![UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
}],
)
.await;
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let waits = vec![None, Some(Duration::from_millis(100u64))];
assert!(!reply.ready);
assert!(reply.exists); for replay_timeout in waits {
assert!(reply.error.is_none(), "error: {:?}", reply.error); let reply = UpgradeRegionsHandler
.handle(
&handler_context,
UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
},
)
.await;
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
} }
#[tokio::test] #[tokio::test]
async fn test_region_not_ready_with_retry() { async fn test_region_not_ready_with_retry() {
common_telemetry::init_default_ut_logging();
let mock_region_server = mock_region_server(); let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
@@ -361,48 +264,58 @@ mod tests {
// Region is not ready. // Region is not ready.
region_engine.mock_role = Some(Some(RegionRole::Follower)); region_engine.mock_role = Some(Some(RegionRole::Follower));
region_engine.handle_request_mock_fn = Some(Box::new(|_, _| Ok(0))); region_engine.handle_request_mock_fn = Some(Box::new(|_, _| Ok(0)));
                // Note: don't change this delay; the timing assertion below depends on it.
region_engine.handle_request_delay = Some(Duration::from_millis(300)); region_engine.handle_request_delay = Some(Duration::from_millis(300));
}); });
mock_region_server.register_test_region(region_id, mock_engine); mock_region_server.register_test_region(region_id, mock_engine);
let waits = vec![Duration::from_millis(100u64), Duration::from_millis(100u64)];
let waits = vec![
Some(Duration::from_millis(100u64)),
Some(Duration::from_millis(100u64)),
];
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
for replay_timeout in waits { for replay_timeout in waits {
let reply = UpgradeRegionsHandler::new_test() let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout, replay_timeout,
..Default::default() ..Default::default()
}], },
) )
.await; .await;
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready); assert!(!reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none(), "error: {:?}", reply.error); assert!(reply.error.is_none());
} }
let reply = UpgradeRegionsHandler::new_test() let timer = Instant::now();
let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout: Duration::from_millis(500), replay_timeout: Some(Duration::from_millis(500)),
..Default::default() ..Default::default()
}], },
) )
.await; .await;
        let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; // Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(reply.ready); assert!(reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none(), "error: {:?}", reply.error); assert!(reply.error.is_none());
} }
#[tokio::test] #[tokio::test]
async fn test_region_error() { async fn test_region_error() {
common_telemetry::init_default_ut_logging();
let mock_region_server = mock_region_server(); let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1); let region_id = RegionId::new(1024, 1);
@@ -422,37 +335,38 @@ mod tests {
mock_region_server.register_test_region(region_id, mock_engine); mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server); let handler_context = HandlerContext::new_for_test(mock_region_server);
let reply = UpgradeRegionsHandler::new_test()
let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
..Default::default() ..Default::default()
}], },
) )
.await; .await;
        // It didn't wait for the handle to return, so it had no idea about the error. // It didn't wait for the handle to return, so it had no idea about the error.
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready); assert!(!reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_none()); assert!(reply.error.is_none());
let reply = UpgradeRegionsHandler::new_test() let reply = UpgradeRegionsHandler
.handle( .handle(
&handler_context, &handler_context,
vec![UpgradeRegion { UpgradeRegion {
region_id, region_id,
replay_timeout: Duration::from_millis(200), replay_timeout: Some(Duration::from_millis(200)),
..Default::default() ..Default::default()
}], },
) )
.await; .await;
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready); assert!(!reply.ready);
assert!(reply.exists); assert!(reply.exists);
assert!(reply.error.is_some()); assert!(reply.error.is_some());
assert!(reply.error.as_ref().unwrap().contains("mock_error")); assert!(reply.error.unwrap().contains("mock_error"));
} }
} }

View File

@@ -75,20 +75,4 @@ lazy_static! {
&[RESULT_TYPE] &[RESULT_TYPE]
) )
.unwrap(); .unwrap();
/// Total count of failed region server requests.
pub static ref REGION_SERVER_REQUEST_FAILURE_COUNT: IntCounterVec = register_int_counter_vec!(
"greptime_datanode_region_request_fail_count",
"failed region server requests count",
&[REGION_REQUEST_TYPE]
)
.unwrap();
/// Total count of failed insert requests to region server.
pub static ref REGION_SERVER_INSERT_FAIL_COUNT: IntCounterVec = register_int_counter_vec!(
"greptime_datanode_region_failed_insert_count",
"failed region server insert requests count",
&[REGION_REQUEST_TYPE]
)
.unwrap();
} }

View File

@@ -66,8 +66,7 @@ use store_api::region_engine::{
SettableRegionRoleState, SettableRegionRoleState,
}; };
use store_api::region_request::{ use store_api::region_request::{
AffectedRows, BatchRegionDdlRequest, RegionCatchupRequest, RegionCloseRequest, AffectedRows, BatchRegionDdlRequest, RegionCloseRequest, RegionOpenRequest, RegionRequest,
RegionOpenRequest, RegionRequest,
}; };
use store_api::storage::RegionId; use store_api::storage::RegionId;
use tokio::sync::{Semaphore, SemaphorePermit}; use tokio::sync::{Semaphore, SemaphorePermit};
@@ -192,17 +191,6 @@ impl RegionServer {
.await .await
} }
#[tracing::instrument(skip_all)]
pub async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
self.inner
.handle_batch_catchup_requests(parallelism, requests)
.await
}
#[tracing::instrument(skip_all, fields(request_type = request.request_type()))] #[tracing::instrument(skip_all, fields(request_type = request.request_type()))]
pub async fn handle_request( pub async fn handle_request(
&self, &self,
@@ -411,14 +399,6 @@ impl RegionServer {
#[cfg(test)] #[cfg(test)]
/// Registers a region for test purpose. /// Registers a region for test purpose.
pub(crate) fn register_test_region(&self, region_id: RegionId, engine: RegionEngineRef) { pub(crate) fn register_test_region(&self, region_id: RegionId, engine: RegionEngineRef) {
{
let mut engines = self.inner.engines.write().unwrap();
if !engines.contains_key(engine.name()) {
debug!("Registering test engine: {}", engine.name());
engines.insert(engine.name().to_string(), engine.clone());
}
}
self.inner self.inner
.region_map .region_map
.insert(region_id, RegionEngineWithStatus::Ready(engine)); .insert(region_id, RegionEngineWithStatus::Ready(engine));
@@ -600,8 +580,6 @@ impl RegionServer {
#[async_trait] #[async_trait]
impl RegionServerHandler for RegionServer { impl RegionServerHandler for RegionServer {
async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponseV1> { async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponseV1> {
let failed_requests_cnt = crate::metrics::REGION_SERVER_REQUEST_FAILURE_COUNT
.with_label_values(&[request.as_ref()]);
let response = match &request { let response = match &request {
region_request::Body::Creates(_) region_request::Body::Creates(_)
| region_request::Body::Drops(_) | region_request::Body::Drops(_)
@@ -619,9 +597,6 @@ impl RegionServerHandler for RegionServer {
_ => self.handle_requests_in_serial(request).await, _ => self.handle_requests_in_serial(request).await,
} }
.map_err(BoxedError::new) .map_err(BoxedError::new)
.inspect_err(|_| {
failed_requests_cnt.inc();
})
.context(ExecuteGrpcRequestSnafu)?; .context(ExecuteGrpcRequestSnafu)?;
Ok(RegionResponseV1 { Ok(RegionResponseV1 {
@@ -997,116 +972,6 @@ impl RegionServerInner {
.collect::<Vec<_>>()) .collect::<Vec<_>>())
} }
pub async fn handle_batch_catchup_requests_inner(
&self,
engine: RegionEngineRef,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
for (region_id, _) in &requests {
self.set_region_status_not_ready(*region_id, &engine, &RegionChange::Catchup);
}
let region_ids = requests
.iter()
.map(|(region_id, _)| *region_id)
.collect::<Vec<_>>();
let mut responses = Vec::with_capacity(requests.len());
match engine
.handle_batch_catchup_requests(parallelism, requests)
.await
{
Ok(results) => {
for (region_id, result) in results {
match result {
Ok(_) => {
if let Err(e) = self
.set_region_status_ready(
region_id,
engine.clone(),
RegionChange::Catchup,
)
.await
{
error!(e; "Failed to set region to ready: {}", region_id);
responses.push((region_id, Err(BoxedError::new(e))));
} else {
responses.push((region_id, Ok(())));
}
}
Err(e) => {
self.unset_region_status(region_id, &engine, RegionChange::Catchup);
error!(e; "Failed to catchup region: {}", region_id);
responses.push((region_id, Err(e)));
}
}
}
}
Err(e) => {
for region_id in region_ids {
self.unset_region_status(region_id, &engine, RegionChange::Catchup);
}
error!(e; "Failed to catchup batch regions");
return error::UnexpectedSnafu {
violated: format!("Failed to catchup batch regions: {:?}", e),
}
.fail();
}
}
Ok(responses)
}
pub async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
let mut engine_grouped_requests: HashMap<String, Vec<_>> = HashMap::new();
let mut responses = Vec::with_capacity(requests.len());
for (region_id, request) in requests {
if let Ok(engine) = self.get_engine(region_id, &RegionChange::Catchup) {
match engine {
CurrentEngine::Engine(engine) => {
engine_grouped_requests
.entry(engine.name().to_string())
.or_default()
.push((region_id, request));
}
CurrentEngine::EarlyReturn(_) => {
return error::UnexpectedSnafu {
violated: format!("Unexpected engine type for region {}", region_id),
}
.fail();
}
}
} else {
responses.push((
region_id,
Err(BoxedError::new(
error::RegionNotFoundSnafu { region_id }.build(),
)),
));
}
}
for (engine, requests) in engine_grouped_requests {
let engine = self
.engines
.read()
.unwrap()
.get(&engine)
.with_context(|| RegionEngineNotFoundSnafu { name: &engine })?
.clone();
responses.extend(
self.handle_batch_catchup_requests_inner(engine, parallelism, requests)
.await?,
);
}
Ok(responses)
}
// Handle requests in batch. // Handle requests in batch.
// //
// limitation: all create requests must be in the same engine. // limitation: all create requests must be in the same engine.
@@ -1235,11 +1100,6 @@ impl RegionServerInner {
}) })
} }
Err(err) => { Err(err) => {
if matches!(region_change, RegionChange::Ingest) {
crate::metrics::REGION_SERVER_INSERT_FAIL_COUNT
.with_label_values(&[request_type])
.inc();
}
// Removes the region status if the operation fails. // Removes the region status if the operation fails.
self.unset_region_status(region_id, &engine, region_change); self.unset_region_status(region_id, &engine, region_change);
Err(err) Err(err)

View File

@@ -277,10 +277,6 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Null(NullType)) matches!(self, ConcreteDataType::Null(NullType))
} }
pub(crate) fn is_struct(&self) -> bool {
matches!(self, ConcreteDataType::Struct(_))
}
/// Try to cast the type as a [`ListType`]. /// Try to cast the type as a [`ListType`].
pub fn as_list(&self) -> Option<&ListType> { pub fn as_list(&self) -> Option<&ListType> {
match self { match self {

View File

@@ -266,14 +266,6 @@ pub enum Error {
#[snafu(implicit)] #[snafu(implicit)]
location: Location, location: Location,
}, },
#[snafu(display("Failed to parse or serialize arrow metadata"))]
ArrowMetadata {
#[snafu(source)]
error: arrow::error::ArrowError,
#[snafu(implicit)]
location: Location,
},
} }
impl ErrorExt for Error { impl ErrorExt for Error {
@@ -315,8 +307,7 @@ impl ErrorExt for Error {
| ConvertArrowArrayToScalars { .. } | ConvertArrowArrayToScalars { .. }
| ConvertScalarToArrowArray { .. } | ConvertScalarToArrowArray { .. }
| ParseExtendedType { .. } | ParseExtendedType { .. }
| InconsistentStructFieldsAndItems { .. } | InconsistentStructFieldsAndItems { .. } => StatusCode::Internal,
| ArrowMetadata { .. } => StatusCode::Internal,
} }
} }

View File

@@ -1,15 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod json;

View File

@@ -1,104 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType};
use serde::{Deserialize, Serialize};
use crate::json::JsonStructureSettings;
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct JsonMetadata {
/// Indicates how the JSON is stored in the underlying data type.
///
/// This field can be `None` when the data is converted to its complete, structured in-memory form.
pub json_structure_settings: Option<JsonStructureSettings>,
}
#[derive(Debug, Clone)]
pub struct JsonExtensionType(Arc<JsonMetadata>);
impl JsonExtensionType {
pub fn new(metadata: Arc<JsonMetadata>) -> Self {
JsonExtensionType(metadata)
}
}
impl ExtensionType for JsonExtensionType {
const NAME: &'static str = "greptime.json";
type Metadata = Arc<JsonMetadata>;
fn metadata(&self) -> &Self::Metadata {
&self.0
}
fn serialize_metadata(&self) -> Option<String> {
serde_json::to_string(self.metadata()).ok()
}
fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
if let Some(metadata) = metadata {
let metadata = serde_json::from_str(metadata).map_err(|e| {
ArrowError::ParseError(format!("Failed to deserialize JSON metadata: {}", e))
})?;
Ok(Arc::new(metadata))
} else {
Ok(Arc::new(JsonMetadata::default()))
}
}
fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
match data_type {
// object
DataType::Struct(_)
// array
| DataType::List(_)
| DataType::ListView(_)
| DataType::LargeList(_)
| DataType::LargeListView(_)
// string
| DataType::Utf8
| DataType::Utf8View
| DataType::LargeUtf8
// number
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Float32
| DataType::Float64
// boolean
| DataType::Boolean
// null
| DataType::Null
// legacy json type
| DataType::Binary => Ok(()),
dt => Err(ArrowError::SchemaError(format!(
"Unexpected data type {dt}"
))),
}
}
fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
let json = Self(metadata);
json.supports_data_type(data_type)?;
Ok(json)
}
}
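As a quick, standalone illustration of the extension convention used above (an editor's sketch, not part of this change: it relies only on Arrow's standard `ARROW:extension:*` metadata keys, and the field name and metadata JSON here are made up):
use std::collections::HashMap;
use arrow_schema::{DataType, Field};
fn main() {
    // The extension name and its serialized metadata travel in the field's metadata map,
    // which is what `serialize_metadata`/`deserialize_metadata` above read and write.
    let metadata = HashMap::from([
        (
            "ARROW:extension:name".to_string(),
            "greptime.json".to_string(),
        ),
        (
            "ARROW:extension:metadata".to_string(),
            r#"{"json_structure_settings":null}"#.to_string(),
        ),
    ]);
    let field = Field::new("payload", DataType::Utf8, true).with_metadata(metadata);
    // A reader recognizes the extension by name before deserializing its metadata.
    assert_eq!(
        field.metadata().get("ARROW:extension:name").map(String::as_str),
        Some("greptime.json")
    );
}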

View File

@@ -13,13 +13,11 @@
// limitations under the License. // limitations under the License.
#![feature(assert_matches)] #![feature(assert_matches)]
#![feature(box_patterns)]
pub mod arrow_array; pub mod arrow_array;
pub mod data_type; pub mod data_type;
pub mod duration; pub mod duration;
pub mod error; pub mod error;
pub mod extension;
pub mod interval; pub mod interval;
pub mod json; pub mod json;
pub mod macros; pub mod macros;

View File

@@ -32,8 +32,9 @@ pub use crate::schema::column_schema::{
COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY, COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY, COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY, COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata, FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY, Metadata, SKIPPING_INDEX_KEY, SkippingIndexOptions,
SkippingIndexType, TIME_INDEX_KEY,
}; };
pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema; pub use crate::schema::raw::RawSchema;

View File

@@ -17,17 +17,13 @@ use std::fmt;
use std::str::FromStr; use std::str::FromStr;
use arrow::datatypes::Field; use arrow::datatypes::Field;
use arrow_schema::extension::{
EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY, ExtensionType,
};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::{ResultExt, ensure}; use snafu::{ResultExt, ensure};
use sqlparser_derive::{Visit, VisitMut}; use sqlparser_derive::{Visit, VisitMut};
use crate::data_type::{ConcreteDataType, DataType}; use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{ use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result};
self, ArrowMetadataSnafu, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result, use crate::json::JsonStructureSettings;
};
use crate::schema::TYPE_KEY; use crate::schema::TYPE_KEY;
use crate::schema::constraint::ColumnDefaultConstraint; use crate::schema::constraint::ColumnDefaultConstraint;
use crate::value::Value; use crate::value::Value;
@@ -46,6 +42,7 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index"; pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
/// Key used to store skip options in arrow field's metadata. /// Key used to store skip options in arrow field's metadata.
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index"; pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
pub const JSON_STRUCTURE_SETTINGS_KEY: &str = "greptime:json:structure_settings";
/// Keys used in fulltext options /// Keys used in fulltext options
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable"; pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
@@ -397,38 +394,18 @@ impl ColumnSchema {
Ok(()) Ok(())
} }
pub fn extension_type<E>(&self) -> Result<Option<E>> pub fn json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
where self.metadata
E: ExtensionType, .get(JSON_STRUCTURE_SETTINGS_KEY)
{ .map(|json| serde_json::from_str(json).context(error::DeserializeSnafu { json }))
let extension_type_name = self.metadata.get(EXTENSION_TYPE_NAME_KEY); .transpose()
if extension_type_name.map(|s| s.as_str()) == Some(E::NAME) {
let extension_metadata = self.metadata.get(EXTENSION_TYPE_METADATA_KEY);
let extension_metadata =
E::deserialize_metadata(extension_metadata.map(|s| s.as_str()))
.context(ArrowMetadataSnafu)?;
let extension = E::try_new(&self.data_type.as_arrow_type(), extension_metadata)
.context(ArrowMetadataSnafu)?;
Ok(Some(extension))
} else {
Ok(None)
}
} }
pub fn with_extension_type<E>(&mut self, extension_type: &E) -> Result<()> pub fn with_json_structure_settings(&mut self, settings: &JsonStructureSettings) -> Result<()> {
where self.metadata.insert(
E: ExtensionType, JSON_STRUCTURE_SETTINGS_KEY.to_string(),
{ serde_json::to_string(settings).context(error::SerializeSnafu)?,
self.metadata );
.insert(EXTENSION_TYPE_NAME_KEY.to_string(), E::NAME.to_string());
if let Some(extension_metadata) = extension_type.serialize_metadata() {
self.metadata
.insert(EXTENSION_TYPE_METADATA_KEY.to_string(), extension_metadata);
}
Ok(()) Ok(())
} }
} }

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::{BTreeMap, HashMap}; use std::collections::BTreeMap;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::Arc;
@@ -31,12 +31,9 @@ use crate::scalars::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId; use crate::type_id::LogicalTypeId;
use crate::types::{ListType, StructField, StructType}; use crate::types::{ListType, StructField, StructType};
use crate::value::Value; use crate::value::Value;
use crate::vectors::json::builder::JsonVectorBuilder;
use crate::vectors::{BinaryVectorBuilder, MutableVector}; use crate::vectors::{BinaryVectorBuilder, MutableVector};
pub const JSON_TYPE_NAME: &str = "Json"; pub const JSON_TYPE_NAME: &str = "Json";
const JSON_PLAIN_FIELD_NAME: &str = "__plain__";
const JSON_PLAIN_FIELD_METADATA_KEY: &str = "is_plain_json";
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default)]
pub enum JsonFormat { pub enum JsonFormat {
@@ -57,46 +54,28 @@ impl JsonType {
Self { format } Self { format }
} }
pub(crate) fn empty() -> Self { // TODO(LFC): remove "allow unused"
Self { #[allow(unused)]
format: JsonFormat::Native(Box::new(ConcreteDataType::null_datatype())),
}
}
/// Make json type a struct type, by: /// Make json type a struct type, by:
/// - if the json is an object, its entries are mapped to struct fields, obviously; /// - if the json is an object, its entries are mapped to struct fields, obviously;
/// - if not, the json is one of bool, number, string or array, make it a special field called /// - if not, the json is one of bool, number, string or array, make it a special field called
/// [JSON_PLAIN_FIELD_NAME] with metadata [JSON_PLAIN_FIELD_METADATA_KEY] = `"true"` in a /// "__plain" in a struct with only that field.
/// struct with only that field.
pub(crate) fn as_struct_type(&self) -> StructType { pub(crate) fn as_struct_type(&self) -> StructType {
match &self.format { match &self.format {
JsonFormat::Jsonb => StructType::default(), JsonFormat::Jsonb => StructType::default(),
JsonFormat::Native(inner) => match inner.as_ref() { JsonFormat::Native(inner) => match inner.as_ref() {
ConcreteDataType::Struct(t) => t.clone(), ConcreteDataType::Struct(t) => t.clone(),
x => { x => StructType::new(Arc::new(vec![StructField::new(
let mut field = "__plain".to_string(),
StructField::new(JSON_PLAIN_FIELD_NAME.to_string(), x.clone(), true); x.clone(),
field.insert_metadata(JSON_PLAIN_FIELD_METADATA_KEY, true); true,
StructType::new(Arc::new(vec![field])) )])),
}
}, },
} }
} }
/// Check if this json type is the special "plain" one. // TODO(LFC): remove "allow unused"
/// See [JsonType::as_struct_type]. #[allow(unused)]
pub(crate) fn is_plain_json(&self) -> bool {
let JsonFormat::Native(box ConcreteDataType::Struct(t)) = &self.format else {
return true;
};
let fields = t.fields();
let Some((single, [])) = fields.split_first() else {
return false;
};
single.name() == JSON_PLAIN_FIELD_NAME
&& single.metadata(JSON_PLAIN_FIELD_METADATA_KEY) == Some("true")
}
/// Try to merge this json type with others, error on datatype conflict. /// Try to merge this json type with others, error on datatype conflict.
pub(crate) fn merge(&mut self, other: &JsonType) -> Result<()> { pub(crate) fn merge(&mut self, other: &JsonType) -> Result<()> {
match (&self.format, &other.format) { match (&self.format, &other.format) {
@@ -112,47 +91,6 @@ impl JsonType {
.fail(), .fail(),
} }
} }
pub(crate) fn is_mergeable(&self, other: &JsonType) -> bool {
match (&self.format, &other.format) {
(JsonFormat::Jsonb, JsonFormat::Jsonb) => true,
(JsonFormat::Native(this), JsonFormat::Native(that)) => {
is_mergeable(this.as_ref(), that.as_ref())
}
_ => false,
}
}
}
fn is_mergeable(this: &ConcreteDataType, that: &ConcreteDataType) -> bool {
fn is_mergeable_struct(this: &StructType, that: &StructType) -> bool {
let this_fields = this.fields();
let this_fields = this_fields
.iter()
.map(|x| (x.name(), x))
.collect::<HashMap<_, _>>();
for that_field in that.fields().iter() {
if let Some(this_field) = this_fields.get(that_field.name())
&& !is_mergeable(this_field.data_type(), that_field.data_type())
{
return false;
}
}
true
}
match (this, that) {
(this, that) if this == that => true,
(ConcreteDataType::List(this), ConcreteDataType::List(that)) => {
is_mergeable(this.item_type(), that.item_type())
}
(ConcreteDataType::Struct(this), ConcreteDataType::Struct(that)) => {
is_mergeable_struct(this, that)
}
(ConcreteDataType::Null(_), _) | (_, ConcreteDataType::Null(_)) => true,
_ => false,
}
} }
fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result<ConcreteDataType> { fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result<ConcreteDataType> {
@@ -228,10 +166,7 @@ impl DataType for JsonType {
} }
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> { fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
match self.format { Box::new(BinaryVectorBuilder::with_capacity(capacity))
JsonFormat::Jsonb => Box::new(BinaryVectorBuilder::with_capacity(capacity)),
JsonFormat::Native(_) => Box::new(JsonVectorBuilder::with_capacity(capacity)),
}
} }
fn try_cast(&self, from: Value) -> Option<Value> { fn try_cast(&self, from: Value) -> Option<Value> {
@@ -291,12 +226,10 @@ mod tests {
let result = json_type.merge(other); let result = json_type.merge(other);
match (result, expected) { match (result, expected) {
(Ok(()), Ok(expected)) => { (Ok(()), Ok(expected)) => {
assert_eq!(json_type.name(), expected); assert_eq!(json_type.name(), expected)
assert!(json_type.is_mergeable(other));
} }
(Err(err), Err(expected)) => { (Err(err), Err(expected)) => {
assert_eq!(err.to_string(), expected); assert_eq!(err.to_string(), expected)
assert!(!json_type.is_mergeable(other));
} }
_ => unreachable!(), _ => unreachable!(),
} }
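To make the "plain json becomes a one-field struct" rule above concrete, here is a tiny standalone sketch over serde_json values (an editor's illustration; it does not use this crate's types, and the "__plain__" field name is the marker used by one side of this diff):
use serde_json::{Value, json};
// Wrap any non-object JSON value into a single-field object, mirroring how
// `as_struct_type` turns a plain json into a struct with one special field.
fn as_object(value: Value) -> Value {
    match value {
        Value::Object(_) => value,
        other => json!({ "__plain__": other }),
    }
}
fn main() {
    assert_eq!(as_object(json!(1)), json!({ "__plain__": 1 }));
    assert_eq!(as_object(json!({ "a": true })), json!({ "a": true }));
}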

View File

@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::BTreeMap;
use std::sync::Arc; use std::sync::Arc;
use arrow::datatypes::{DataType as ArrowDataType, Field}; use arrow::datatypes::{DataType as ArrowDataType, Field};
@@ -47,15 +46,6 @@ impl TryFrom<&Fields> for StructType {
} }
} }
impl<const N: usize> From<[StructField; N]> for StructType {
fn from(value: [StructField; N]) -> Self {
let value: Box<[StructField]> = Box::new(value);
Self {
fields: Arc::new(value.into_vec()),
}
}
}
impl DataType for StructType { impl DataType for StructType {
fn name(&self) -> String { fn name(&self) -> String {
format!( format!(
@@ -118,7 +108,6 @@ pub struct StructField {
name: String, name: String,
data_type: ConcreteDataType, data_type: ConcreteDataType,
nullable: bool, nullable: bool,
metadata: BTreeMap<String, String>,
} }
impl StructField { impl StructField {
@@ -127,7 +116,6 @@ impl StructField {
name, name,
data_type, data_type,
nullable, nullable,
metadata: BTreeMap::new(),
} }
} }
@@ -147,25 +135,11 @@ impl StructField {
self.nullable self.nullable
} }
pub(crate) fn insert_metadata(&mut self, key: impl ToString, value: impl ToString) {
self.metadata.insert(key.to_string(), value.to_string());
}
pub(crate) fn metadata(&self, key: &str) -> Option<&str> {
self.metadata.get(key).map(String::as_str)
}
pub fn to_df_field(&self) -> Field { pub fn to_df_field(&self) -> Field {
let metadata = self
.metadata
.iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
Field::new( Field::new(
self.name.clone(), self.name.clone(),
self.data_type.as_arrow_type(), self.data_type.as_arrow_type(),
self.nullable, self.nullable,
) )
.with_metadata(metadata)
} }
} }

View File

@@ -873,12 +873,6 @@ impl From<&[u8]> for Value {
} }
} }
impl From<()> for Value {
fn from(_: ()) -> Self {
Value::Null
}
}
impl TryFrom<Value> for serde_json::Value { impl TryFrom<Value> for serde_json::Value {
type Error = serde_json::Error; type Error = serde_json::Error;

View File

@@ -35,7 +35,6 @@ mod duration;
mod eq; mod eq;
mod helper; mod helper;
mod interval; mod interval;
pub(crate) mod json;
mod list; mod list;
mod null; mod null;
pub(crate) mod operations; pub(crate) mod operations;

View File

@@ -464,14 +464,6 @@ impl Helper {
} }
} }
#[cfg(test)]
pub(crate) fn pretty_print(vector: VectorRef) -> String {
let array = vector.to_arrow_array();
arrow::util::pretty::pretty_format_columns(&vector.vector_type_name(), &[array])
.map(|x| x.to_string())
.unwrap_or_else(|e| e.to_string())
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use arrow::array::{ use arrow::array::{

View File

@@ -1,15 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub(crate) mod builder;

View File

@@ -1,485 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{Result, TryFromValueSnafu, UnsupportedOperationSnafu};
use crate::prelude::{ValueRef, Vector, VectorRef};
use crate::types::JsonType;
use crate::value::StructValueRef;
use crate::vectors::{MutableVector, StructVectorBuilder};
struct JsonStructsBuilder {
json_type: JsonType,
inner: StructVectorBuilder,
}
impl JsonStructsBuilder {
fn new(json_type: JsonType, capacity: usize) -> Self {
let struct_type = json_type.as_struct_type();
let inner = StructVectorBuilder::with_type_and_capacity(struct_type, capacity);
Self { json_type, inner }
}
fn len(&self) -> usize {
self.inner.len()
}
fn push(&mut self, value: &ValueRef) -> Result<()> {
if self.json_type.is_plain_json() {
let value = ValueRef::Struct(StructValueRef::RefList {
val: vec![value.clone()],
fields: self.json_type.as_struct_type(),
});
self.inner.try_push_value_ref(&value)
} else {
self.inner.try_push_value_ref(value)
}
}
/// Try to merge (and consume the data of) the other json builder into this one.
/// Note that the other builder's json type must be mergeable with this one's
/// (this one's json type has all the fields of the other's, with no datatype conflicts).
/// Normally this is guaranteed, as long as json values are pushed through [JsonVectorBuilder].
fn try_merge(&mut self, other: &mut JsonStructsBuilder) -> Result<()> {
debug_assert!(self.json_type.is_mergeable(&other.json_type));
fn helper(this: &mut StructVectorBuilder, that: &mut StructVectorBuilder) -> Result<()> {
let that_len = that.len();
if let Some(x) = that.mut_null_buffer().finish() {
this.mut_null_buffer().append_buffer(&x)
} else {
this.mut_null_buffer().append_n_non_nulls(that_len);
}
let that_fields = that.struct_type().fields();
let mut that_builders = that_fields
.iter()
.zip(that.mut_value_builders().iter_mut())
.map(|(field, builder)| (field.name(), builder))
.collect::<HashMap<_, _>>();
for (field, this_builder) in this
.struct_type()
.fields()
.iter()
.zip(this.mut_value_builders().iter_mut())
{
if let Some(that_builder) = that_builders.get_mut(field.name()) {
if field.data_type().is_struct() {
let this = this_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: a struct datatype field must be corresponding to a struct vector builder.
.unwrap();
let that = that_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: other builder with same field name must have same datatype,
// ensured because the two json types are mergeable.
.unwrap();
helper(this, that)?;
} else {
let vector = that_builder.to_vector();
this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?;
}
} else {
this_builder.push_nulls(that_len);
}
}
Ok(())
}
helper(&mut self.inner, &mut other.inner)
}
/// Same as [JsonStructsBuilder::try_merge], but does not consume the other builder's data.
fn try_merge_cloned(&mut self, other: &JsonStructsBuilder) -> Result<()> {
debug_assert!(self.json_type.is_mergeable(&other.json_type));
fn helper(this: &mut StructVectorBuilder, that: &StructVectorBuilder) -> Result<()> {
let that_len = that.len();
if let Some(x) = that.null_buffer().finish_cloned() {
this.mut_null_buffer().append_buffer(&x)
} else {
this.mut_null_buffer().append_n_non_nulls(that_len);
}
let that_fields = that.struct_type().fields();
let that_builders = that_fields
.iter()
.zip(that.value_builders().iter())
.map(|(field, builder)| (field.name(), builder))
.collect::<HashMap<_, _>>();
for (field, this_builder) in this
.struct_type()
.fields()
.iter()
.zip(this.mut_value_builders().iter_mut())
{
if let Some(that_builder) = that_builders.get(field.name()) {
if field.data_type().is_struct() {
let this = this_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: a struct datatype field must be corresponding to a struct vector builder.
.unwrap();
let that = that_builder
.as_any()
.downcast_ref::<StructVectorBuilder>()
// Safety: other builder with same field name must have same datatype,
// ensured because the two json types are mergeable.
.unwrap();
helper(this, that)?;
} else {
let vector = that_builder.to_vector_cloned();
this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?;
}
} else {
this_builder.push_nulls(that_len);
}
}
Ok(())
}
helper(&mut self.inner, &other.inner)
}
}
/// The vector builder for json type values.
///
/// Json types are dynamic, to some degree (as long as they can be merged into each other), and so
/// are json values. Json values are physically stored in struct vectors, which require the types of
/// struct values to be fixed inside a certain struct vector. To resolve this "dynamic" vs "fixed"
/// datatype problem, each type of json value gets its own struct vector builder in this builder.
/// Once a value of a new json type is pushed into this builder, a new "child" builder is created for it.
///
/// Given the "mixed" nature of the values stored in this builder, a "merge" operation is performed
/// to produce the json vector. The "merge" iterates over all the "child" builders and fills
/// nulls for missing json fields. The final vector's json type is fixed to be the "merge" of all
/// pushed json types.
pub(crate) struct JsonVectorBuilder {
merged_type: JsonType,
capacity: usize,
builders: Vec<JsonStructsBuilder>,
}
impl JsonVectorBuilder {
pub(crate) fn with_capacity(capacity: usize) -> Self {
Self {
merged_type: JsonType::empty(),
capacity,
builders: vec![],
}
}
fn try_create_new_builder(&mut self, json_type: &JsonType) -> Result<&mut JsonStructsBuilder> {
self.merged_type.merge(json_type)?;
let builder = JsonStructsBuilder::new(json_type.clone(), self.capacity);
self.builders.push(builder);
let len = self.builders.len();
Ok(&mut self.builders[len - 1])
}
}
impl MutableVector for JsonVectorBuilder {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::Json(self.merged_type.clone())
}
fn len(&self) -> usize {
self.builders.iter().map(|x| x.len()).sum()
}
fn as_any(&self) -> &dyn Any {
self
}
fn as_mut_any(&mut self) -> &mut dyn Any {
self
}
fn to_vector(&mut self) -> VectorRef {
// Fast path:
if self.builders.len() == 1 {
return self.builders[0].inner.to_vector();
}
let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity);
for builder in self.builders.iter_mut() {
unified_jsons
.try_merge(builder)
// Safety: the "unified_jsons" has the merged json type from all the builders,
// so it should merge them without errors.
.unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}"));
}
unified_jsons.inner.to_vector()
}
fn to_vector_cloned(&self) -> VectorRef {
// Fast path:
if self.builders.len() == 1 {
return self.builders[0].inner.to_vector_cloned();
}
let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity);
for builder in self.builders.iter() {
unified_jsons
.try_merge_cloned(builder)
// Safety: the "unified_jsons" has the merged json type from all the builders,
// so it should merge them without errors.
.unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}"));
}
unified_jsons.inner.to_vector_cloned()
}
fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
let data_type = value.data_type();
let json_type = data_type.as_json().with_context(|| TryFromValueSnafu {
reason: format!("expected json value, got {value:?}"),
})?;
let builder = match self.builders.last_mut() {
Some(last) => {
if &last.json_type != json_type {
self.try_create_new_builder(json_type)?
} else {
last
}
}
None => self.try_create_new_builder(json_type)?,
};
let ValueRef::Json(value) = value else {
// Safety: a value whose datatype is json must be a json value.
unreachable!()
};
builder.push(value)
}
fn push_null(&mut self) {
let null_json_value = ValueRef::Json(Box::new(ValueRef::Null));
self.try_push_value_ref(&null_json_value)
// Safety: as the method "try_push_value_ref" shows, a null json value should always be
// pushable into any json vector.
.unwrap_or_else(|e| {
panic!("failed to push null json value: {null_json_value:?}, error: {e}")
});
}
fn extend_slice_of(&mut self, _: &dyn Vector, _: usize, _: usize) -> Result<()> {
UnsupportedOperationSnafu {
op: "extend_slice_of",
vector_type: "JsonVector",
}
.fail()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::data_type::DataType;
use crate::json::JsonStructureSettings;
use crate::vectors::helper::pretty_print;
fn push(json: &str, builder: &mut JsonVectorBuilder, expected: std::result::Result<(), &str>) {
let settings = JsonStructureSettings::Structured(None);
let json: serde_json::Value = serde_json::from_str(json).unwrap();
let value = settings.encode(json).unwrap();
let value = value.as_value_ref();
let result = builder.try_push_value_ref(&value);
match (result, expected) {
(Ok(()), Ok(())) => (),
(Err(e), Err(expected)) => assert_eq!(e.to_string(), expected),
_ => unreachable!(),
}
}
#[test]
fn test_push_plain_jsons() -> Result<()> {
let jsons = vec!["1", "2", r#""s""#, "[true]"];
let results = vec![
Ok(()),
Ok(()),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: String",
),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: List<Boolean>",
),
];
let mut builder = JsonVectorBuilder::with_capacity(1);
for (json, result) in jsons.into_iter().zip(results.into_iter()) {
push(json, &mut builder, result);
}
let vector = builder.to_vector();
let expected = r#"
+----------------+
| StructVector |
+----------------+
| {__plain__: 1} |
| {__plain__: 2} |
+----------------+"#;
assert_eq!(pretty_print(vector), expected.trim());
Ok(())
}
#[test]
fn test_push_json_objects() -> Result<()> {
let jsons = vec![
r#"{
"s": "a",
"list": [1, 2, 3]
}"#,
r#"{
"list": [4],
"s": "b"
}"#,
r#"{
"s": "c",
"float": 0.9
}"#,
r#"{
"float": 0.8,
"s": "d"
}"#,
r#"{
"float": 0.7,
"int": -1
}"#,
r#"{
"int": 0,
"float": 0.6
}"#,
r#"{
"int": 1,
"object": {"hello": "world", "timestamp": 1761523200000}
}"#,
r#"{
"object": {"hello": "greptime", "timestamp": 1761523201000},
"int": 2
}"#,
r#"{
"object": {"timestamp": 1761523202000},
"nested": {"a": {"b": {"b": {"a": "abba"}}}}
}"#,
r#"{
"nested": {"a": {"b": {"a": {"b": "abab"}}}},
"object": {"timestamp": 1761523203000}
}"#,
];
let mut builder = JsonVectorBuilder::with_capacity(1);
for json in jsons {
push(json, &mut builder, Ok(()));
}
assert_eq!(builder.len(), 10);
// test children builders:
assert_eq!(builder.builders.len(), 6);
let expect_types = [
r#"Json<Struct<"list": List<Int64>, "s": String>>"#,
r#"Json<Struct<"float": Float64, "s": String>>"#,
r#"Json<Struct<"float": Float64, "int": Int64>>"#,
r#"Json<Struct<"int": Int64, "object": Struct<"hello": String, "timestamp": Int64>>>"#,
r#"Json<Struct<"nested": Struct<"a": Struct<"b": Struct<"b": Struct<"a": String>>>>, "object": Struct<"timestamp": Int64>>>"#,
r#"Json<Struct<"nested": Struct<"a": Struct<"b": Struct<"a": Struct<"b": String>>>>, "object": Struct<"timestamp": Int64>>>"#,
];
let expect_vectors = [
r#"
+-------------------------+
| StructVector |
+-------------------------+
| {list: [1, 2, 3], s: a} |
| {list: [4], s: b} |
+-------------------------+"#,
r#"
+--------------------+
| StructVector |
+--------------------+
| {float: 0.9, s: c} |
| {float: 0.8, s: d} |
+--------------------+"#,
r#"
+-----------------------+
| StructVector |
+-----------------------+
| {float: 0.7, int: -1} |
| {float: 0.6, int: 0} |
+-----------------------+"#,
r#"
+---------------------------------------------------------------+
| StructVector |
+---------------------------------------------------------------+
| {int: 1, object: {hello: world, timestamp: 1761523200000}} |
| {int: 2, object: {hello: greptime, timestamp: 1761523201000}} |
+---------------------------------------------------------------+"#,
r#"
+------------------------------------------------------------------------+
| StructVector |
+------------------------------------------------------------------------+
| {nested: {a: {b: {b: {a: abba}}}}, object: {timestamp: 1761523202000}} |
+------------------------------------------------------------------------+"#,
r#"
+------------------------------------------------------------------------+
| StructVector |
+------------------------------------------------------------------------+
| {nested: {a: {b: {a: {b: abab}}}}, object: {timestamp: 1761523203000}} |
+------------------------------------------------------------------------+"#,
];
for (builder, (expect_type, expect_vector)) in builder
.builders
.iter()
.zip(expect_types.into_iter().zip(expect_vectors.into_iter()))
{
assert_eq!(builder.json_type.name(), expect_type);
let vector = builder.inner.to_vector_cloned();
assert_eq!(pretty_print(vector), expect_vector.trim());
}
// test final merged json type:
let expected = r#"Json<Struct<"float": Float64, "int": Int64, "list": List<Int64>, "nested": Struct<"a": Struct<"b": Struct<"a": Struct<"b": String>, "b": Struct<"a": String>>>>, "object": Struct<"hello": String, "timestamp": Int64>, "s": String>>"#;
assert_eq!(builder.data_type().to_string(), expected);
// test final produced vector:
let expected = r#"
+-------------------------------------------------------------------------------------------------------------------+
| StructVector |
+-------------------------------------------------------------------------------------------------------------------+
| {float: , int: , list: [1, 2, 3], nested: , object: , s: a} |
| {float: , int: , list: [4], nested: , object: , s: b} |
| {float: 0.9, int: , list: , nested: , object: , s: c} |
| {float: 0.8, int: , list: , nested: , object: , s: d} |
| {float: 0.7, int: -1, list: , nested: , object: , s: } |
| {float: 0.6, int: 0, list: , nested: , object: , s: } |
| {float: , int: 1, list: , nested: , object: {hello: world, timestamp: 1761523200000}, s: } |
| {float: , int: 2, list: , nested: , object: {hello: greptime, timestamp: 1761523201000}, s: } |
| {float: , int: , list: , nested: {a: {b: {a: , b: {a: abba}}}}, object: {hello: , timestamp: 1761523202000}, s: } |
| {float: , int: , list: , nested: {a: {b: {a: {b: abab}, b: }}}, object: {hello: , timestamp: 1761523203000}, s: } |
+-------------------------------------------------------------------------------------------------------------------+"#;
let vector = builder.to_vector_cloned();
assert_eq!(pretty_print(vector), expected.trim());
let vector = builder.to_vector();
assert_eq!(pretty_print(vector), expected.trim());
Ok(())
}
}
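The doc comment on JsonVectorBuilder above describes the core trick: collect values per concrete json type, then merge them under the union of all fields, filling nulls for the fields a row does not have. As a compact, standalone sketch of that null-filling merge (an editor's illustration over plain JSON objects, independent of this crate's vector types):
use std::collections::BTreeSet;
use serde_json::{Map, Value};
// Unify rows with different key sets under the union of all keys,
// filling the missing keys with null.
fn unify(rows: Vec<Map<String, Value>>) -> Vec<Map<String, Value>> {
    let keys: BTreeSet<String> = rows.iter().flat_map(|row| row.keys().cloned()).collect();
    rows.into_iter()
        .map(|row| {
            keys.iter()
                .map(|k| (k.clone(), row.get(k).cloned().unwrap_or(Value::Null)))
                .collect()
        })
        .collect()
}
fn main() {
    let rows: Vec<Map<String, Value>> = vec![
        serde_json::from_str(r#"{"s":"a","list":[1,2,3]}"#).unwrap(),
        serde_json::from_str(r#"{"s":"c","float":0.9}"#).unwrap(),
    ];
    for row in unify(rows) {
        // Prints {"float":null,"list":[1,2,3],"s":"a"} and {"float":0.9,"list":null,"s":"c"}.
        println!("{}", Value::Object(row));
    }
}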

View File

@@ -323,26 +323,6 @@ impl StructVectorBuilder {
} }
self.null_buffer.append_null(); self.null_buffer.append_null();
} }
pub(crate) fn struct_type(&self) -> &StructType {
&self.fields
}
pub(crate) fn value_builders(&self) -> &[Box<dyn MutableVector>] {
&self.value_builders
}
pub(crate) fn mut_value_builders(&mut self) -> &mut [Box<dyn MutableVector>] {
&mut self.value_builders
}
pub(crate) fn null_buffer(&self) -> &NullBufferBuilder {
&self.null_buffer
}
pub(crate) fn mut_null_buffer(&mut self) -> &mut NullBufferBuilder {
&mut self.null_buffer
}
} }
impl MutableVector for StructVectorBuilder { impl MutableVector for StructVectorBuilder {

View File

@@ -21,7 +21,6 @@ use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime}; use std::time::{Duration, Instant, SystemTime};
use api::v1::{RowDeleteRequest, RowDeleteRequests, RowInsertRequest, RowInsertRequests}; use api::v1::{RowDeleteRequest, RowDeleteRequests, RowInsertRequest, RowInsertRequests};
use common_base::memory_limit::MemoryLimit;
use common_config::Configurable; use common_config::Configurable;
use common_error::ext::BoxedError; use common_error::ext::BoxedError;
use common_meta::key::TableMetadataManagerRef; use common_meta::key::TableMetadataManagerRef;
@@ -133,7 +132,6 @@ impl Default for FlownodeOptions {
query: QueryOptions { query: QueryOptions {
parallelism: 1, parallelism: 1,
allow_query_fallback: false, allow_query_fallback: false,
memory_pool_size: MemoryLimit::default(),
}, },
user_provider: None, user_provider: None,
memory: MemoryOptions::default(), memory: MemoryOptions::default(),

View File

@@ -23,7 +23,7 @@ use api::v1::query_request::Query;
use api::v1::{CreateTableExpr, QueryRequest}; use api::v1::{CreateTableExpr, QueryRequest};
use client::{Client, Database}; use client::{Client, Database};
use common_error::ext::{BoxedError, ErrorExt}; use common_error::ext::{BoxedError, ErrorExt};
use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_tls_config}; use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role}; use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
use common_meta::peer::Peer; use common_meta::peer::Peer;
use common_meta::rpc::store::RangeRequest; use common_meta::rpc::store::RangeRequest;
@@ -123,10 +123,12 @@ impl FrontendClient {
let cfg = ChannelConfig::new() let cfg = ChannelConfig::new()
.connect_timeout(batch_opts.grpc_conn_timeout) .connect_timeout(batch_opts.grpc_conn_timeout)
.timeout(batch_opts.query_timeout); .timeout(batch_opts.query_timeout);
if let Some(tls) = &batch_opts.frontend_tls {
let tls_config = load_tls_config(batch_opts.frontend_tls.as_ref()) let cfg = cfg.client_tls_config(tls.clone());
.context(InvalidClientConfigSnafu)?; ChannelManager::with_tls_config(cfg).context(InvalidClientConfigSnafu)?
ChannelManager::with_config(cfg, tls_config) } else {
ChannelManager::with_config(cfg)
}
}, },
auth, auth,
query, query,

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::{HashMap, HashSet}; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use async_trait::async_trait; use async_trait::async_trait;
@@ -28,7 +28,6 @@ use common_function::scalars::udf::create_udf;
use common_query::{Output, OutputData}; use common_query::{Output, OutputData};
use common_recordbatch::adapter::RecordBatchStreamAdapter; use common_recordbatch::adapter::RecordBatchStreamAdapter;
use common_recordbatch::util; use common_recordbatch::util;
use common_telemetry::warn;
use datafusion::dataframe::DataFrame; use datafusion::dataframe::DataFrame;
use datafusion::execution::SessionStateBuilder; use datafusion::execution::SessionStateBuilder;
use datafusion::execution::context::SessionContext; use datafusion::execution::context::SessionContext;
@@ -43,9 +42,8 @@ use servers::error::{
}; };
use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams}; use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams};
use servers::otlp::trace::{ use servers::otlp::trace::{
DURATION_NANO_COLUMN, KEY_OTEL_STATUS_ERROR_KEY, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, DURATION_NANO_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_KIND_COLUMN,
SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, SPAN_STATUS_ERROR, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, TIMESTAMP_COLUMN, TRACE_ID_COLUMN,
TIMESTAMP_COLUMN, TRACE_ID_COLUMN,
}; };
use servers::query_handler::JaegerQueryHandler; use servers::query_handler::JaegerQueryHandler;
use session::context::QueryContextRef; use session::context::QueryContextRef;
@@ -265,7 +263,7 @@ impl JaegerQueryHandler for Instance {
self.query_engine(), self.query_engine(),
vec![wildcard()], vec![wildcard()],
filters, filters,
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. vec![],
None, None,
None, None,
vec![], vec![],
@@ -324,7 +322,6 @@ async fn query_trace_table(
})?; })?;
let is_data_model_v1 = table let is_data_model_v1 = table
.clone()
.table_info() .table_info()
.meta .meta
.options .options
@@ -333,14 +330,6 @@ async fn query_trace_table(
.map(|s| s.as_str()) .map(|s| s.as_str())
== Some(TABLE_DATA_MODEL_TRACE_V1); == Some(TABLE_DATA_MODEL_TRACE_V1);
// collect to set
let col_names = table
.table_info()
.meta
.field_column_names()
.map(|s| format!("\"{}\"", s))
.collect::<HashSet<String>>();
let df_context = create_df_context(query_engine)?; let df_context = create_df_context(query_engine)?;
let dataframe = df_context let dataframe = df_context
@@ -353,7 +342,7 @@ async fn query_trace_table(
let dataframe = filters let dataframe = filters
.into_iter() .into_iter()
.chain(tags.map_or(Ok(vec![]), |t| { .chain(tags.map_or(Ok(vec![]), |t| {
tags_filters(&dataframe, t, is_data_model_v1, &col_names) tags_filters(&dataframe, t, is_data_model_v1)
})?) })?)
.try_fold(dataframe, |df, expr| { .try_fold(dataframe, |df, expr| {
df.filter(expr).context(DataFusionSnafu) df.filter(expr).context(DataFusionSnafu)
@@ -483,73 +472,23 @@ fn json_tag_filters(
Ok(filters) Ok(filters)
} }
/// Helper function to check if span_key or resource_key exists in col_names and create an expression. fn flatten_tag_filters(tags: HashMap<String, JsonValue>) -> ServerResult<Vec<Expr>> {
/// If neither exists, logs a warning and returns None.
#[inline]
fn check_col_and_build_expr<F>(
span_key: String,
resource_key: String,
key: &str,
col_names: &HashSet<String>,
expr_builder: F,
) -> Option<Expr>
where
F: FnOnce(String) -> Expr,
{
if col_names.contains(&span_key) {
return Some(expr_builder(span_key));
}
if col_names.contains(&resource_key) {
return Some(expr_builder(resource_key));
}
warn!("tag key {} not found in table columns", key);
None
}
fn flatten_tag_filters(
tags: HashMap<String, JsonValue>,
col_names: &HashSet<String>,
) -> ServerResult<Vec<Expr>> {
let filters = tags let filters = tags
.into_iter() .into_iter()
.filter_map(|(key, value)| { .filter_map(|(key, value)| {
if key == KEY_OTEL_STATUS_ERROR_KEY && value == JsonValue::Bool(true) { let key = format!("\"span_attributes.{}\"", key);
return Some(col(SPAN_STATUS_CODE).eq(lit(SPAN_STATUS_ERROR)));
}
// TODO(shuiyisong): add more precise mapping from key to col name
let span_key = format!("\"span_attributes.{}\"", key);
let resource_key = format!("\"resource_attributes.{}\"", key);
match value { match value {
JsonValue::String(value) => { JsonValue::String(value) => Some(col(key).eq(lit(value))),
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
}
JsonValue::Number(value) => { JsonValue::Number(value) => {
if value.is_f64() { if value.is_f64() {
// safe to unwrap as checked previously // safe to unwrap as checked previously
let value = value.as_f64().unwrap(); Some(col(key).eq(lit(value.as_f64().unwrap())))
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
} else { } else {
let value = value.as_i64().unwrap(); Some(col(key).eq(lit(value.as_i64().unwrap())))
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
} }
} }
JsonValue::Bool(value) => { JsonValue::Bool(value) => Some(col(key).eq(lit(value))),
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| { JsonValue::Null => Some(col(key).is_null()),
col(k).eq(lit(value))
})
}
JsonValue::Null => {
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).is_null()
})
}
// not supported at the moment // not supported at the moment
JsonValue::Array(_value) => None, JsonValue::Array(_value) => None,
JsonValue::Object(_value) => None, JsonValue::Object(_value) => None,
@@ -563,10 +502,9 @@ fn tags_filters(
dataframe: &DataFrame, dataframe: &DataFrame,
tags: HashMap<String, JsonValue>, tags: HashMap<String, JsonValue>,
is_data_model_v1: bool, is_data_model_v1: bool,
col_names: &HashSet<String>,
) -> ServerResult<Vec<Expr>> { ) -> ServerResult<Vec<Expr>> {
if is_data_model_v1 { if is_data_model_v1 {
flatten_tag_filters(tags, col_names) flatten_tag_filters(tags)
} else { } else {
json_tag_filters(dataframe, tags) json_tag_filters(dataframe, tags)
} }
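For reference, the flattened tag filters above are plain DataFusion expressions built with `col`/`lit`. A minimal standalone sketch of that shape (an editor's illustration; the `span_attributes.` column prefix is taken from this diff, everything else is assumed, and numbers, arrays and objects are elided):
use std::collections::HashMap;
use datafusion::prelude::{Expr, col, lit};
use serde_json::Value as JsonValue;
// Turn Jaeger-style tag key/values into per-column equality filters.
fn tag_filters(tags: HashMap<String, JsonValue>) -> Vec<Expr> {
    tags.into_iter()
        .filter_map(|(key, value)| {
            let column = format!("\"span_attributes.{}\"", key);
            match value {
                JsonValue::String(s) => Some(col(column).eq(lit(s))),
                JsonValue::Bool(b) => Some(col(column).eq(lit(b))),
                JsonValue::Null => Some(col(column).is_null()),
                _ => None, // numbers, arrays and objects are left out of this sketch
            }
        })
        .collect()
}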

View File

@@ -36,7 +36,7 @@ async fn run() {
.timeout(Duration::from_secs(3)) .timeout(Duration::from_secs(3))
.connect_timeout(Duration::from_secs(5)) .connect_timeout(Duration::from_secs(5))
.tcp_nodelay(true); .tcp_nodelay(true);
let channel_manager = ChannelManager::with_config(config, None); let channel_manager = ChannelManager::with_config(config);
let mut meta_client = MetaClientBuilder::datanode_default_options(id) let mut meta_client = MetaClientBuilder::datanode_default_options(id)
.channel_manager(channel_manager) .channel_manager(channel_manager)
.build(); .build();

View File

@@ -101,7 +101,7 @@ pub async fn create_meta_client(
if let MetaClientType::Frontend = client_type { if let MetaClientType::Frontend = client_type {
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout); let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);
builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config, None)); builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config));
if let Some(plugins) = plugins { if let Some(plugins) = plugins {
let region_follower = plugins.get::<RegionFollowerClientRef>(); let region_follower = plugins.get::<RegionFollowerClientRef>();
if let Some(region_follower) = region_follower { if let Some(region_follower) = region_follower {
@@ -112,8 +112,8 @@ pub async fn create_meta_client(
} }
builder = builder builder = builder
.channel_manager(ChannelManager::with_config(base_config, None)) .channel_manager(ChannelManager::with_config(base_config))
.heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config, None)); .heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config));
let mut meta_client = builder.build(); let mut meta_client = builder.build();

View File

@@ -72,10 +72,7 @@ serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
servers.workspace = true servers.workspace = true
snafu.workspace = true snafu.workspace = true
sqlx = { workspace = true, features = [ sqlx = { workspace = true, optional = true }
"mysql",
"chrono",
], optional = true }
store-api.workspace = true store-api.workspace = true
strum.workspace = true strum.workspace = true
table.workspace = true table.workspace = true

View File

@@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration; use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, info, warn}; use common_telemetry::{error, warn};
use common_time::Timestamp; use common_time::Timestamp;
use snafu::{OptionExt, ResultExt, ensure}; use snafu::{OptionExt, ResultExt, ensure};
use sqlx::mysql::{MySqlArguments, MySqlRow}; use sqlx::mysql::{MySqlArguments, MySqlRow};
@@ -645,13 +645,6 @@ impl Election for MySqlElection {
} }
async fn reset_campaign(&self) { async fn reset_campaign(&self) {
info!("Resetting campaign");
if self.is_leader.load(Ordering::Relaxed) {
if let Err(err) = self.step_down_without_lock().await {
error!(err; "Failed to step down without lock");
}
info!("Step down without lock successfully, due to reset campaign");
}
if let Err(err) = self.client.lock().await.reset_client().await { if let Err(err) = self.client.lock().await.reset_client().await {
error!(err; "Failed to reset client"); error!(err; "Failed to reset client");
} }

View File

@@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration; use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, info, warn}; use common_telemetry::{error, warn};
use common_time::Timestamp; use common_time::Timestamp;
use deadpool_postgres::{Manager, Pool}; use deadpool_postgres::{Manager, Pool};
use snafu::{OptionExt, ResultExt, ensure}; use snafu::{OptionExt, ResultExt, ensure};
@@ -477,13 +477,6 @@ impl Election for PgElection {
} }
async fn reset_campaign(&self) { async fn reset_campaign(&self) {
info!("Resetting campaign");
if self.is_leader.load(Ordering::Relaxed) {
if let Err(err) = self.step_down_without_lock().await {
error!(err; "Failed to step down without lock");
}
info!("Step down without lock successfully, due to reset campaign");
}
if let Err(err) = self.pg_client.write().await.reset_client().await { if let Err(err) = self.pg_client.write().await.reset_client().await {
error!(err; "Failed to reset client"); error!(err; "Failed to reset client");
} }
@@ -781,12 +774,16 @@ impl PgElection {
key: key.clone(), key: key.clone(),
..Default::default() ..Default::default()
}; };
send_leader_change_and_set_flags( if self
&self.is_leader, .is_leader
&self.leader_infancy, .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
&self.leader_watcher, .is_ok()
LeaderChangeMessage::StepDown(Arc::new(leader_key)), && let Err(e) = self
); .leader_watcher
.send(LeaderChangeMessage::StepDown(Arc::new(leader_key)))
{
error!(e; "Failed to send leader change message");
}
Ok(()) Ok(())
} }

View File

@@ -19,7 +19,6 @@ use api::v1::meta::{HeartbeatRequest, RegionLease, Role};
use async_trait::async_trait; use async_trait::async_trait;
use common_meta::key::TableMetadataManagerRef; use common_meta::key::TableMetadataManagerRef;
use common_meta::region_keeper::MemoryRegionKeeperRef; use common_meta::region_keeper::MemoryRegionKeeperRef;
use common_telemetry::error;
use store_api::region_engine::GrantedRegion; use store_api::region_engine::GrantedRegion;
use store_api::storage::RegionId; use store_api::storage::RegionId;
@@ -84,44 +83,36 @@ impl HeartbeatHandler for RegionLeaseHandler {
let regions = stat.regions(); let regions = stat.regions();
let datanode_id = stat.id; let datanode_id = stat.id;
match self let RenewRegionLeasesResponse {
non_exists,
renewed,
} = self
.region_lease_keeper .region_lease_keeper
.renew_region_leases(datanode_id, &regions) .renew_region_leases(datanode_id, &regions)
.await .await?;
{
Ok(RenewRegionLeasesResponse {
non_exists,
renewed,
}) => {
let renewed = if let Some(renewer) = &self.customized_region_lease_renewer {
renewer
.renew(ctx, renewed)
.into_iter()
.map(|region| region.into())
.collect()
} else {
renewed
.into_iter()
.map(|(region_id, region_lease_info)| {
GrantedRegion::new(region_id, region_lease_info.role).into()
})
.collect::<Vec<_>>()
};
acc.region_lease = Some(RegionLease { let renewed = if let Some(renewer) = &self.customized_region_lease_renewer {
regions: renewed, renewer
duration_since_epoch: req.duration_since_epoch, .renew(ctx, renewed)
lease_seconds: self.region_lease_seconds, .into_iter()
closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(), .map(|region| region.into())
}); .collect()
acc.inactive_region_ids = non_exists; } else {
} renewed
Err(e) => { .into_iter()
error!(e; "Failed to renew region leases for datanode: {datanode_id:?}, regions: {:?}", regions); .map(|(region_id, region_lease_info)| {
// If we throw error here, the datanode will be marked as failure by region failure handler. GrantedRegion::new(region_id, region_lease_info.role).into()
// So we only log the error and continue. })
} .collect::<Vec<_>>()
} };
acc.region_lease = Some(RegionLease {
regions: renewed,
duration_since_epoch: req.duration_since_epoch,
lease_seconds: self.region_lease_seconds,
closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(),
});
acc.inactive_region_ids = non_exists;
Ok(HandleControl::Continue) Ok(HandleControl::Continue)
} }

View File

@@ -375,14 +375,12 @@ pub struct MetasrvNodeInfo {
// The node total cpu millicores // The node total cpu millicores
#[serde(default)] #[serde(default)]
pub total_cpu_millicores: i64, pub total_cpu_millicores: i64,
// The node total memory bytes
#[serde(default)] #[serde(default)]
// The node total memory bytes
pub total_memory_bytes: i64, pub total_memory_bytes: i64,
/// The node build cpu usage millicores /// The node build cpu usage millicores
#[serde(default)]
pub cpu_usage_millicores: i64, pub cpu_usage_millicores: i64,
/// The node build memory usage bytes /// The node build memory usage bytes
#[serde(default)]
pub memory_usage_bytes: i64, pub memory_usage_bytes: i64,
// The node hostname // The node hostname
#[serde(default)] #[serde(default)]
@@ -860,18 +858,3 @@ impl Metasrv {
} }
} }
} }
#[cfg(test)]
mod tests {
use crate::metasrv::MetasrvNodeInfo;
#[test]
fn test_deserialize_metasrv_node_info() {
let str = r#"{"addr":"127.0.0.1:4002","version":"0.1.0","git_commit":"1234567890","start_time_ms":1715145600}"#;
let node_info: MetasrvNodeInfo = serde_json::from_str(str).unwrap();
assert_eq!(node_info.addr, "127.0.0.1:4002");
assert_eq!(node_info.version, "0.1.0");
assert_eq!(node_info.git_commit, "1234567890");
assert_eq!(node_info.start_time_ms, 1715145600);
}
}
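A minimal sketch of why the removed test passes even though the JSON omits the newer fields: `#[serde(default)]` backfills them with zero values, so heartbeats from older nodes still deserialize. This assumes `serde` (with derive) and `serde_json` are available; the struct below is a trimmed stand-in for `MetasrvNodeInfo`.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct NodeInfoSketch {
    addr: String,
    version: String,
    git_commit: String,
    start_time_ms: u64,
    // Missing in older payloads; defaulted to 0 instead of failing deserialization.
    #[serde(default)]
    total_cpu_millicores: i64,
    #[serde(default)]
    total_memory_bytes: i64,
}

fn main() {
    let s = r#"{"addr":"127.0.0.1:4002","version":"0.1.0","git_commit":"1234567890","start_time_ms":1715145600}"#;
    let info: NodeInfoSketch = serde_json::from_str(s).unwrap();
    assert_eq!(info.addr, "127.0.0.1:4002");
    assert_eq!(info.total_cpu_millicores, 0);
    assert_eq!(info.total_memory_bytes, 0);
    println!("{info:?}");
}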

View File

@@ -373,8 +373,7 @@ impl MetasrvBuilder {
runtime_switch_manager.clone(), runtime_switch_manager.clone(),
meta_peer_client.clone(), meta_peer_client.clone(),
leader_cached_kv_backend.clone(), leader_cached_kv_backend.clone(),
) );
.with_state(state.clone());
Some(RegionFailureHandler::new( Some(RegionFailureHandler::new(
region_supervisor, region_supervisor,

View File

@@ -134,7 +134,7 @@ pub async fn mock(
.timeout(Duration::from_secs(10)) .timeout(Duration::from_secs(10))
.connect_timeout(Duration::from_secs(10)) .connect_timeout(Duration::from_secs(10))
.tcp_nodelay(true); .tcp_nodelay(true);
let channel_manager = ChannelManager::with_config(config, None); let channel_manager = ChannelManager::with_config(config);
// Move client to an option so we can _move_ the inner value // Move client to an option so we can _move_ the inner value
// on the first attempt to connect. All other attempts will fail. // on the first attempt to connect. All other attempts will fail.

View File

@@ -41,7 +41,7 @@ use common_meta::key::table_route::TableRouteValue;
use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey}; use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey};
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef}; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef};
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock}; use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock};
use common_meta::peer::Peer; use common_meta::peer::Peer;
use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard}; use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard};
use common_procedure::error::{ use common_procedure::error::{
@@ -231,6 +231,8 @@ pub struct VolatileContext {
/// the corresponding [RegionRoute](common_meta::rpc::router::RegionRoute) of the opening region /// the corresponding [RegionRoute](common_meta::rpc::router::RegionRoute) of the opening region
/// was written into [TableRouteValue](common_meta::key::table_route::TableRouteValue). /// was written into [TableRouteValue](common_meta::key::table_route::TableRouteValue).
opening_region_guard: Option<OperatingRegionGuard>, opening_region_guard: Option<OperatingRegionGuard>,
/// `table_route` is stored via previous steps for future use.
table_route: Option<DeserializedValueWithBytes<TableRouteValue>>,
/// `datanode_table` is stored via previous steps for future use. /// `datanode_table` is stored via previous steps for future use.
from_peer_datanode_table: Option<DatanodeTableValue>, from_peer_datanode_table: Option<DatanodeTableValue>,
/// `table_info` is stored via previous steps for future use. /// `table_info` is stored via previous steps for future use.
@@ -397,23 +399,29 @@ impl Context {
/// Retry: /// Retry:
/// - Failed to retrieve the metadata of table. /// - Failed to retrieve the metadata of table.
pub async fn get_table_route_value( pub async fn get_table_route_value(
&self, &mut self,
) -> Result<DeserializedValueWithBytes<TableRouteValue>> { ) -> Result<&DeserializedValueWithBytes<TableRouteValue>> {
let table_id = self.persistent_ctx.region_id.table_id(); let table_route_value = &mut self.volatile_ctx.table_route;
let table_route = self
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableRoute: {table_id}"),
})?
.context(error::TableRouteNotFoundSnafu { table_id })?;
Ok(table_route) if table_route_value.is_none() {
let table_id = self.persistent_ctx.region_id.table_id();
let table_route = self
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableRoute: {table_id}"),
})?
.context(error::TableRouteNotFoundSnafu { table_id })?;
*table_route_value = Some(table_route);
}
Ok(table_route_value.as_ref().unwrap())
} }
/// Notifies the RegionSupervisor to register failure detectors of failed region. /// Notifies the RegionSupervisor to register failure detectors of failed region.
@@ -455,6 +463,12 @@ impl Context {
.await; .await;
} }
/// Removes the `table_route` of [VolatileContext], returns true if any.
pub fn remove_table_route_value(&mut self) -> bool {
let value = self.volatile_ctx.table_route.take();
value.is_some()
}
/// Returns the `table_info` of [VolatileContext] if any. /// Returns the `table_info` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote. /// Otherwise, returns the value retrieved from remote.
/// ///
@@ -649,13 +663,14 @@ impl RegionMigrationProcedure {
}) })
} }
async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> { async fn rollback_inner(&mut self) -> Result<()> {
let _timer = METRIC_META_REGION_MIGRATION_EXECUTE let _timer = METRIC_META_REGION_MIGRATION_EXECUTE
.with_label_values(&["rollback"]) .with_label_values(&["rollback"])
.start_timer(); .start_timer();
let table_id = self.context.region_id().table_id(); let table_id = self.context.region_id().table_id();
let region_id = self.context.region_id(); let region_id = self.context.region_id();
self.context.remove_table_route_value();
let table_metadata_manager = self.context.table_metadata_manager.clone(); let table_metadata_manager = self.context.table_metadata_manager.clone();
let table_route = self.context.get_table_route_value().await?; let table_route = self.context.get_table_route_value().await?;
@@ -668,11 +683,9 @@ impl RegionMigrationProcedure {
.any(|route| route.is_leader_downgrading()); .any(|route| route.is_leader_downgrading());
if downgraded { if downgraded {
let table_lock = TableLock::Write(region_id.table_id()).into();
let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await;
info!("Rolling back downgraded region leader table route, region: {region_id}"); info!("Rolling back downgraded region leader table route, region: {region_id}");
table_metadata_manager table_metadata_manager
.update_leader_region_status(table_id, &table_route, |route| { .update_leader_region_status(table_id, table_route, |route| {
if route.region.id == region_id { if route.region.id == region_id {
Some(None) Some(None)
} else { } else {
@@ -685,9 +698,6 @@ impl RegionMigrationProcedure {
.with_context(|_| error::RetryLaterWithSourceSnafu { .with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"),
})?; })?;
self.context
.deregister_failure_detectors_for_candidate_region()
.await;
} }
self.context.register_failure_detectors().await; self.context.register_failure_detectors().await;
@@ -702,8 +712,8 @@ impl Procedure for RegionMigrationProcedure {
Self::TYPE_NAME Self::TYPE_NAME
} }
async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> { async fn rollback(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<()> {
self.rollback_inner(ctx) self.rollback_inner()
.await .await
.map_err(ProcedureError::external) .map_err(ProcedureError::external)
} }
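The hunks above trade a fetch-on-every-call getter for a cached value in the volatile context plus an explicit invalidation hook (`get_table_route_value` / `remove_table_route_value`). A minimal sketch of that shape, with a plain `String` standing in for the deserialized table route:

struct VolatileCtx {
    table_route: Option<String>,
}

impl VolatileCtx {
    // Fetch once, then reuse the cached value on later calls.
    fn get_table_route(&mut self, fetch: impl FnOnce() -> String) -> &String {
        if self.table_route.is_none() {
            self.table_route = Some(fetch());
        }
        self.table_route.as_ref().unwrap()
    }

    // Drop the cache; returns true if there was a cached value.
    fn remove_table_route(&mut self) -> bool {
        self.table_route.take().is_some()
    }
}

fn main() {
    let mut ctx = VolatileCtx { table_route: None };
    assert_eq!(ctx.get_table_route(|| "route-v0".to_string()), "route-v0");
    // A later call reuses the cached value; the closure is not invoked again.
    assert_eq!(ctx.get_table_route(|| unreachable!()), "route-v0");
    // After a failed metadata update, drop the cache so the retry refetches fresh state.
    assert!(ctx.remove_table_route());
}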

View File

@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
use std::any::Any; use std::any::Any;
use std::ops::Div;
use std::time::Duration; use std::time::Duration;
use api::v1::meta::MailboxMessage; use api::v1::meta::MailboxMessage;
@@ -140,15 +139,12 @@ impl OpenCandidateRegion {
input: open_instruction.to_string(), input: open_instruction.to_string(),
})?; })?;
let operation_timeout =
ctx.next_operation_timeout()
.context(error::ExceededDeadlineSnafu {
operation: "Open candidate region",
})?;
let operation_timeout = operation_timeout.div(2).max(OPEN_CANDIDATE_REGION_TIMEOUT);
let ch = Channel::Datanode(candidate.id); let ch = Channel::Datanode(candidate.id);
let now = Instant::now(); let now = Instant::now();
let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?; let receiver = ctx
.mailbox
.send(&ch, msg, OPEN_CANDIDATE_REGION_TIMEOUT)
.await?;
match receiver.await { match receiver.await {
Ok(msg) => { Ok(msg) => {
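One side of the hunk above derives the mailbox timeout from the remaining procedure deadline instead of a fixed constant: half of what is left, floored at the per-request minimum `OPEN_CANDIDATE_REGION_TIMEOUT`. A tiny sketch of that arithmetic; the 30-second floor is an assumed value, not the real constant.

use std::time::Duration;

const OPEN_CANDIDATE_REGION_TIMEOUT: Duration = Duration::from_secs(30); // assumed floor

fn open_region_timeout(remaining: Duration) -> Duration {
    // Half of the remaining deadline, but never shorter than the per-request floor.
    (remaining / 2).max(OPEN_CANDIDATE_REGION_TIMEOUT)
}

fn main() {
    assert_eq!(open_region_timeout(Duration::from_secs(120)), Duration::from_secs(60));
    assert_eq!(open_region_timeout(Duration::from_secs(10)), Duration::from_secs(30));
}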

View File

@@ -46,7 +46,7 @@ impl UpdateMetadata {
// TODO(weny): ensures the leader region peer is the `from_peer`. // TODO(weny): ensures the leader region peer is the `from_peer`.
if let Err(err) = table_metadata_manager if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, &current_table_route_value, |route| { .update_leader_region_status(table_id, current_table_route_value, |route| {
if route.region.id == region_id if route.region.id == region_id
&& route && route
.leader_peer .leader_peer
@@ -61,6 +61,7 @@ impl UpdateMetadata {
.await .await
.context(error::TableMetadataManagerSnafu) .context(error::TableMetadataManagerSnafu)
{ {
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!( reason: format!(
"Failed to update the table route during the downgrading leader region, region_id: {region_id}, from_peer_id: {from_peer_id}" "Failed to update the table route during the downgrading leader region, region_id: {region_id}, from_peer_id: {from_peer_id}"
@@ -68,6 +69,8 @@ impl UpdateMetadata {
}); });
} }
ctx.remove_table_route_value();
Ok(()) Ok(())
} }
} }
@@ -78,7 +81,7 @@ mod tests {
use common_meta::key::test_utils::new_test_table_info; use common_meta::key::test_utils::new_test_table_info;
use common_meta::peer::Peer; use common_meta::peer::Peer;
use common_meta::rpc::router::{Region, RegionRoute}; use common_meta::rpc::router::{LeaderState, Region, RegionRoute};
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::error::Error; use crate::error::Error;
@@ -112,6 +115,63 @@ mod tests {
assert!(!err.is_retryable()); assert!(!err.is_retryable());
} }
#[tokio::test]
async fn test_failed_to_update_table_route_error() {
let state = UpdateMetadata::Downgrade;
let persistent_context = new_persistent_context();
let from_peer = persistent_context.from_peer.clone();
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_id = ctx.region_id().table_id();
let table_info = new_test_table_info(1024, vec![1, 2]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(1024, 1)),
leader_peer: Some(from_peer.clone()),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 2)),
leader_peer: Some(Peer::empty(4)),
..Default::default()
},
];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let original_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &original_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
Some(Some(LeaderState::Downgrading))
} else {
None
}
})
.await
.unwrap();
// sets the old table route.
ctx.volatile_ctx.table_route = Some(original_table_route);
let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
}
#[tokio::test] #[tokio::test]
async fn test_only_downgrade_from_peer() { async fn test_only_downgrade_from_peer() {
let mut state = Box::new(UpdateMetadata::Downgrade); let mut state = Box::new(UpdateMetadata::Downgrade);
@@ -152,6 +212,7 @@ mod tests {
// It should remain unchanged. // It should remain unchanged.
assert_eq!(latest_table_route.version().unwrap(), 0); assert_eq!(latest_table_route.version().unwrap(), 0);
assert!(!latest_table_route.region_routes().unwrap()[0].is_leader_downgrading()); assert!(!latest_table_route.region_routes().unwrap()[0].is_leader_downgrading());
assert!(ctx.volatile_ctx.table_route.is_none());
} }
#[tokio::test] #[tokio::test]
@@ -193,5 +254,6 @@ mod tests {
.unwrap(); .unwrap();
assert!(latest_table_route.region_routes().unwrap()[0].is_leader_downgrading()); assert!(latest_table_route.region_routes().unwrap()[0].is_leader_downgrading());
assert!(ctx.volatile_ctx.table_route.is_none());
} }
} }

View File

@@ -35,7 +35,7 @@ impl UpdateMetadata {
let current_table_route_value = ctx.get_table_route_value().await?; let current_table_route_value = ctx.get_table_route_value().await?;
if let Err(err) = table_metadata_manager if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, &current_table_route_value, |route| { .update_leader_region_status(table_id, current_table_route_value, |route| {
if route.region.id == region_id { if route.region.id == region_id {
Some(None) Some(None)
} else { } else {
@@ -45,12 +45,14 @@ impl UpdateMetadata {
.await .await
.context(error::TableMetadataManagerSnafu) .context(error::TableMetadataManagerSnafu)
{ {
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"),
}); });
} }
ctx.register_failure_detectors().await; ctx.register_failure_detectors().await;
ctx.remove_table_route_value();
Ok(()) Ok(())
} }
@@ -59,6 +61,7 @@ impl UpdateMetadata {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::assert_matches::assert_matches; use std::assert_matches::assert_matches;
use std::sync::Arc;
use common_meta::key::test_utils::new_test_table_info; use common_meta::key::test_utils::new_test_table_info;
use common_meta::peer::Peer; use common_meta::peer::Peer;
@@ -70,6 +73,7 @@ mod tests {
use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context}; use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context};
use crate::procedure::region_migration::update_metadata::UpdateMetadata; use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{ContextFactory, PersistentContext, State}; use crate::procedure::region_migration::{ContextFactory, PersistentContext, State};
use crate::region::supervisor::RegionFailureDetectorControl;
fn new_persistent_context() -> PersistentContext { fn new_persistent_context() -> PersistentContext {
test_util::new_persistent_context(1, 2, RegionId::new(1024, 1)) test_util::new_persistent_context(1, 2, RegionId::new(1024, 1))
@@ -89,6 +93,101 @@ mod tests {
assert!(!err.is_retryable()); assert!(!err.is_retryable());
} }
#[tokio::test]
async fn test_update_table_route_with_retry() {
let state = UpdateMetadata::Rollback;
let persistent_context = new_persistent_context();
let from_peer = persistent_context.from_peer.clone();
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let (tx, mut rx) = tokio::sync::mpsc::channel(8);
ctx.region_failure_detector_controller = Arc::new(RegionFailureDetectorControl::new(tx));
let table_id = ctx.region_id().table_id();
let table_info = new_test_table_info(1024, vec![1, 2, 3]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(1024, 1)),
leader_peer: Some(from_peer.clone()),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 2)),
leader_peer: Some(Peer::empty(4)),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 3)),
leader_peer: Some(Peer::empty(5)),
..Default::default()
},
];
let expected_region_routes = {
let mut region_routes = region_routes.clone();
region_routes[0].leader_state = None;
region_routes[1].leader_state = None;
region_routes
};
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let old_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &old_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
Some(None)
} else {
None
}
})
.await
.unwrap();
ctx.volatile_ctx.table_route = Some(old_table_route);
let err = state
.rollback_downgraded_region(&mut ctx)
.await
.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
assert_eq!(rx.len(), 0);
state.rollback_downgraded_region(&mut ctx).await.unwrap();
let event = rx.try_recv().unwrap();
let detecting_regions = event.into_region_failure_detectors();
assert_eq!(
detecting_regions,
vec![(from_peer.id, ctx.persistent_ctx.region_id)]
);
let table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get(table_id)
.await
.unwrap()
.unwrap();
assert_eq!(
&expected_region_routes,
table_route.region_routes().unwrap()
);
}
#[tokio::test] #[tokio::test]
async fn test_next_migration_end_state() { async fn test_next_migration_end_state() {
let mut state = Box::new(UpdateMetadata::Rollback); let mut state = Box::new(UpdateMetadata::Rollback);
@@ -139,6 +238,8 @@ mod tests {
.downcast_ref::<RegionMigrationAbort>() .downcast_ref::<RegionMigrationAbort>()
.unwrap(); .unwrap();
assert!(ctx.volatile_ctx.table_route.is_none());
let table_route = table_metadata_manager let table_route = table_metadata_manager
.table_route_manager() .table_route_manager()
.table_route_storage() .table_route_storage()

View File

@@ -166,7 +166,7 @@ impl UpdateMetadata {
region_options: region_options.clone(), region_options: region_options.clone(),
region_wal_options: region_wal_options.clone(), region_wal_options: region_wal_options.clone(),
}, },
&table_route_value, table_route_value,
region_routes, region_routes,
&region_options, &region_options,
&region_wal_options, &region_wal_options,
@@ -174,11 +174,13 @@ impl UpdateMetadata {
.await .await
.context(error::TableMetadataManagerSnafu) .context(error::TableMetadataManagerSnafu)
{ {
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}"), reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}"),
}); });
}; };
ctx.remove_table_route_value();
ctx.deregister_failure_detectors().await; ctx.deregister_failure_detectors().await;
// Consumes the guard. // Consumes the guard.
ctx.volatile_ctx.opening_region_guard.take(); ctx.volatile_ctx.opening_region_guard.take();
@@ -308,6 +310,71 @@ mod tests {
assert_eq!(new_region_routes[0].leader_peer.as_ref().unwrap().id, 2); assert_eq!(new_region_routes[0].leader_peer.as_ref().unwrap().id, 2);
} }
#[tokio::test]
async fn test_failed_to_update_table_route_error() {
let state = UpdateMetadata::Upgrade;
let env = TestingEnv::new();
let persistent_context = new_persistent_context();
let mut ctx = env.context_factory().new_context(persistent_context);
let opening_keeper = MemoryRegionKeeper::default();
let table_id = 1024;
let table_info = new_test_table_info(table_id, vec![1]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5), Peer::empty(3)],
leader_state: Some(LeaderState::Downgrading),
leader_down_since: Some(current_time_millis()),
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(4)),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let original_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &original_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
// Removes the status.
Some(None)
} else {
None
}
})
.await
.unwrap();
// sets the old table route.
ctx.volatile_ctx.table_route = Some(original_table_route);
let guard = opening_keeper
.register(2, RegionId::new(table_id, 1))
.unwrap();
ctx.volatile_ctx.opening_region_guard = Some(guard);
let err = state.upgrade_candidate_region(&mut ctx).await.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(ctx.volatile_ctx.opening_region_guard.is_some());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
}
#[tokio::test] #[tokio::test]
async fn test_check_metadata() { async fn test_check_metadata() {
let state = UpdateMetadata::Upgrade; let state = UpdateMetadata::Upgrade;
@@ -425,6 +492,7 @@ mod tests {
.unwrap(); .unwrap();
let region_routes = table_route.region_routes().unwrap(); let region_routes = table_route.region_routes().unwrap();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(ctx.volatile_ctx.opening_region_guard.is_none()); assert!(ctx.volatile_ctx.opening_region_guard.is_none());
assert_eq!(region_routes.len(), 1); assert_eq!(region_routes.len(), 1);
assert!(!region_routes[0].is_leader_downgrading()); assert!(!region_routes[0].is_leader_downgrading());

View File

@@ -17,9 +17,7 @@ use std::time::Duration;
use api::v1::meta::MailboxMessage; use api::v1::meta::MailboxMessage;
use common_meta::ddl::utils::parse_region_wal_options; use common_meta::ddl::utils::parse_region_wal_options;
use common_meta::instruction::{ use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply};
Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply,
};
use common_meta::lock_key::RemoteWalLock; use common_meta::lock_key::RemoteWalLock;
use common_meta::wal_options_allocator::extract_topic_from_wal_options; use common_meta::wal_options_allocator::extract_topic_from_wal_options;
use common_procedure::{Context as ProcedureContext, Status}; use common_procedure::{Context as ProcedureContext, Status};
@@ -133,19 +131,19 @@ impl UpgradeCandidateRegion {
None None
}; };
let upgrade_instruction = Instruction::UpgradeRegions(vec![ let upgrade_instruction = Instruction::UpgradeRegion(
UpgradeRegion { UpgradeRegion {
region_id, region_id,
last_entry_id, last_entry_id,
metadata_last_entry_id, metadata_last_entry_id,
replay_timeout, replay_timeout: Some(replay_timeout),
location_id: Some(ctx.persistent_ctx.from_peer.id), location_id: Some(ctx.persistent_ctx.from_peer.id),
replay_entry_id: None, replay_entry_id: None,
metadata_replay_entry_id: None, metadata_replay_entry_id: None,
} }
.with_replay_entry_id(checkpoint.map(|c| c.entry_id)) .with_replay_entry_id(checkpoint.map(|c| c.entry_id))
.with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)), .with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)),
]); );
Ok(upgrade_instruction) Ok(upgrade_instruction)
} }
@@ -195,7 +193,11 @@ impl UpgradeCandidateRegion {
match receiver.await { match receiver.await {
Ok(msg) => { Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?; let reply = HeartbeatMailbox::json_reply(&msg)?;
let InstructionReply::UpgradeRegions(UpgradeRegionsReply { replies }) = reply let InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready,
exists,
error,
}) = reply
else { else {
return error::UnexpectedInstructionReplySnafu { return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(), mailbox_message: msg.to_string(),
@@ -203,13 +205,6 @@ impl UpgradeCandidateRegion {
} }
.fail(); .fail();
}; };
// TODO(weny): handle multiple replies.
let UpgradeRegionReply {
ready,
exists,
error,
..
} = &replies[0];
// Notes: The order of handling is important. // Notes: The order of handling is important.
if error.is_some() { if error.is_some() {

View File

@@ -18,7 +18,7 @@ use api::v1::meta::mailbox_message::Payload;
use api::v1::meta::{HeartbeatResponse, MailboxMessage}; use api::v1::meta::{HeartbeatResponse, MailboxMessage};
use common_meta::instruction::{ use common_meta::instruction::{
DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply, DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply,
UpgradeRegionReply, UpgradeRegionsReply, UpgradeRegionReply,
}; };
use common_meta::key::TableMetadataManagerRef; use common_meta::key::TableMetadataManagerRef;
use common_meta::key::table_route::TableRouteValue; use common_meta::key::table_route::TableRouteValue;
@@ -212,14 +212,11 @@ pub fn new_upgrade_region_reply(
to: "meta".to_string(), to: "meta".to_string(),
timestamp_millis: current_time_millis(), timestamp_millis: current_time_millis(),
payload: Some(Payload::Json( payload: Some(Payload::Json(
serde_json::to_string(&InstructionReply::UpgradeRegions( serde_json::to_string(&InstructionReply::UpgradeRegion(UpgradeRegionReply {
UpgradeRegionsReply::single(UpgradeRegionReply { ready,
region_id: RegionId::new(0, 0), exists,
ready, error,
exists, }))
error,
}),
))
.unwrap(), .unwrap(),
)), )),
} }
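A minimal sketch of what `new_upgrade_region_reply` builds above: the mailbox payload is simply a JSON-serialized reply enum that the receiving side parses back out. The enum shape and field set here are reduced for illustration and do not claim to match the actual wire format.

use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
enum ReplySketch {
    UpgradeRegion {
        ready: bool,
        exists: bool,
        error: Option<String>,
    },
}

fn main() {
    let payload = serde_json::to_string(&ReplySketch::UpgradeRegion {
        ready: true,
        exists: true,
        error: None,
    })
    .unwrap();
    // The receiver deserializes the same enum out of the mailbox message.
    let parsed: ReplySketch = serde_json::from_str(&payload).unwrap();
    assert_eq!(
        parsed,
        ReplySketch::UpgradeRegion { ready: true, exists: true, error: None }
    );
}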

View File

@@ -52,7 +52,6 @@ use crate::procedure::region_migration::{
}; };
use crate::region::failure_detector::RegionFailureDetector; use crate::region::failure_detector::RegionFailureDetector;
use crate::selector::SelectorOptions; use crate::selector::SelectorOptions;
use crate::state::StateRef;
/// `DatanodeHeartbeat` represents the heartbeat signal sent from a datanode. /// `DatanodeHeartbeat` represents the heartbeat signal sent from a datanode.
/// It includes identifiers for the cluster and datanode, a list of regions being monitored, /// It includes identifiers for the cluster and datanode, a list of regions being monitored,
@@ -101,6 +100,16 @@ pub(crate) enum Event {
Dump(tokio::sync::oneshot::Sender<RegionFailureDetector>), Dump(tokio::sync::oneshot::Sender<RegionFailureDetector>),
} }
#[cfg(test)]
impl Event {
pub(crate) fn into_region_failure_detectors(self) -> Vec<DetectingRegion> {
match self {
Self::RegisterFailureDetectors(detecting_regions) => detecting_regions,
_ => unreachable!(),
}
}
}
impl Debug for Event { impl Debug for Event {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
@@ -130,9 +139,6 @@ pub struct RegionSupervisorTicker {
/// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`]. /// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`].
tick_handle: Mutex<Option<JoinHandle<()>>>, tick_handle: Mutex<Option<JoinHandle<()>>>,
/// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`].
initialization_handler: Mutex<Option<JoinHandle<()>>>,
/// The interval of tick. /// The interval of tick.
tick_interval: Duration, tick_interval: Duration,
@@ -176,7 +182,6 @@ impl RegionSupervisorTicker {
); );
Self { Self {
tick_handle: Mutex::new(None), tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval, tick_interval,
initialization_delay, initialization_delay,
initialization_retry_period, initialization_retry_period,
@@ -197,7 +202,7 @@ impl RegionSupervisorTicker {
self.initialization_retry_period, self.initialization_retry_period,
); );
initialization_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); initialization_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
let initialization_handler = common_runtime::spawn_global(async move { common_runtime::spawn_global(async move {
loop { loop {
initialization_interval.tick().await; initialization_interval.tick().await;
let (tx, rx) = oneshot::channel(); let (tx, rx) = oneshot::channel();
@@ -213,7 +218,6 @@ impl RegionSupervisorTicker {
} }
} }
}); });
*self.initialization_handler.lock().unwrap() = Some(initialization_handler);
let sender = self.sender.clone(); let sender = self.sender.clone();
let ticker_loop = tokio::spawn(async move { let ticker_loop = tokio::spawn(async move {
@@ -243,11 +247,6 @@ impl RegionSupervisorTicker {
handle.abort(); handle.abort();
info!("The tick loop is stopped."); info!("The tick loop is stopped.");
} }
let initialization_handler = self.initialization_handler.lock().unwrap().take();
if let Some(initialization_handler) = initialization_handler {
initialization_handler.abort();
info!("The initialization loop is stopped.");
}
} }
} }
@@ -291,8 +290,6 @@ pub struct RegionSupervisor {
peer_resolver: PeerResolverRef, peer_resolver: PeerResolverRef,
/// The kv backend. /// The kv backend.
kv_backend: KvBackendRef, kv_backend: KvBackendRef,
/// The meta state, used to check if the current metasrv is the leader.
state: Option<StateRef>,
} }
/// Controller for managing failure detectors for regions. /// Controller for managing failure detectors for regions.
@@ -376,29 +373,12 @@ impl RegionSupervisor {
runtime_switch_manager, runtime_switch_manager,
peer_resolver, peer_resolver,
kv_backend, kv_backend,
state: None,
} }
} }
/// Sets the meta state.
pub(crate) fn with_state(mut self, state: StateRef) -> Self {
self.state = Some(state);
self
}
/// Runs the main loop. /// Runs the main loop.
pub(crate) async fn run(&mut self) { pub(crate) async fn run(&mut self) {
while let Some(event) = self.receiver.recv().await { while let Some(event) = self.receiver.recv().await {
if let Some(state) = self.state.as_ref()
&& !state.read().unwrap().is_leader()
{
warn!(
"The current metasrv is not the leader, ignore {:?} event",
event
);
continue;
}
match event { match event {
Event::InitializeAllRegions(sender) => { Event::InitializeAllRegions(sender) => {
match self.is_maintenance_mode_enabled().await { match self.is_maintenance_mode_enabled().await {
@@ -433,10 +413,7 @@ impl RegionSupervisor {
self.deregister_failure_detectors(detecting_regions).await self.deregister_failure_detectors(detecting_regions).await
} }
Event::HeartbeatArrived(heartbeat) => self.on_heartbeat_arrived(heartbeat), Event::HeartbeatArrived(heartbeat) => self.on_heartbeat_arrived(heartbeat),
Event::Clear => { Event::Clear => self.clear(),
self.clear();
info!("Region supervisor is initialized.");
}
#[cfg(test)] #[cfg(test)]
Event::Dump(sender) => { Event::Dump(sender) => {
let _ = sender.send(self.failure_detector.dump()); let _ = sender.send(self.failure_detector.dump());
@@ -929,7 +906,6 @@ pub(crate) mod tests {
let (tx, mut rx) = tokio::sync::mpsc::channel(128); let (tx, mut rx) = tokio::sync::mpsc::channel(128);
let ticker = RegionSupervisorTicker { let ticker = RegionSupervisorTicker {
tick_handle: Mutex::new(None), tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval: Duration::from_millis(10), tick_interval: Duration::from_millis(10),
initialization_delay: Duration::from_millis(100), initialization_delay: Duration::from_millis(100),
initialization_retry_period: Duration::from_millis(100), initialization_retry_period: Duration::from_millis(100),
@@ -956,7 +932,6 @@ pub(crate) mod tests {
let (tx, mut rx) = tokio::sync::mpsc::channel(128); let (tx, mut rx) = tokio::sync::mpsc::channel(128);
let ticker = RegionSupervisorTicker { let ticker = RegionSupervisorTicker {
tick_handle: Mutex::new(None), tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval: Duration::from_millis(1000), tick_interval: Duration::from_millis(1000),
initialization_delay: Duration::from_millis(50), initialization_delay: Duration::from_millis(50),
initialization_retry_period: Duration::from_millis(50), initialization_retry_period: Duration::from_millis(50),
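For reference, a simplified sketch of the `RegionSupervisorTicker` shape above: a spawned loop driven by `tokio::time::interval` with `MissedTickBehavior::Skip`, pushing events into an mpsc channel, with the `JoinHandle` kept so the loop can be aborted on shutdown. Assumes a `tokio` runtime; the `Event` type is a stand-in.

use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::MissedTickBehavior;

#[derive(Debug)]
enum Event {
    Tick,
}

fn start_ticker(tick_interval: Duration, sender: mpsc::Sender<Event>) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(tick_interval);
        // Skip missed ticks instead of bursting to catch up.
        interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
        loop {
            interval.tick().await;
            if sender.send(Event::Tick).await.is_err() {
                break; // receiver dropped, stop ticking
            }
        }
    })
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(8);
    let handle = start_ticker(Duration::from_millis(10), tx);
    assert!(matches!(rx.recv().await, Some(Event::Tick)));
    // Aborting the handle stops the loop, mirroring `stop()` above.
    handle.abort();
}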

View File

@@ -79,7 +79,6 @@ impl heartbeat_server::Heartbeat for Metasrv {
let res = handler_group let res = handler_group
.handle(req, ctx.clone()) .handle(req, ctx.clone())
.await .await
.inspect_err(|e| warn!(e; "Failed to handle heartbeat request, pusher: {pusher_id:?}", ))
.map_err(|e| e.into()); .map_err(|e| e.into());
is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader()); is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader());

View File

@@ -75,12 +75,6 @@ impl State {
}) })
} }
/// Returns true if the current state is a leader.
pub fn is_leader(&self) -> bool {
matches!(self, State::Leader(_))
}
/// Returns true if the leader cache is enabled.
pub fn enable_leader_cache(&self) -> bool { pub fn enable_leader_cache(&self) -> bool {
match &self { match &self {
State::Leader(leader) => leader.enable_leader_cache, State::Leader(leader) => leader.enable_leader_cache,

View File

@@ -46,7 +46,6 @@ tracing.workspace = true
common-meta = { workspace = true, features = ["testing"] } common-meta = { workspace = true, features = ["testing"] }
common-test-util.workspace = true common-test-util.workspace = true
mito2 = { workspace = true, features = ["test"] } mito2 = { workspace = true, features = ["test"] }
common-wal = { workspace = true }
[package.metadata.cargo-udeps.ignore] [package.metadata.cargo-udeps.ignore]
normal = ["aquamarine"] normal = ["aquamarine"]

View File

@@ -23,8 +23,8 @@ pub(crate) const DEFAULT_FLUSH_METADATA_REGION_INTERVAL: Duration = Duration::fr
/// Configuration for the metric engine. /// Configuration for the metric engine.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EngineConfig { pub struct EngineConfig {
/// Whether to use sparse primary key encoding. /// Experimental feature to use sparse primary key encoding.
pub sparse_primary_key_encoding: bool, pub experimental_sparse_primary_key_encoding: bool,
/// The flush interval of the metadata region. /// The flush interval of the metadata region.
#[serde( #[serde(
with = "humantime_serde", with = "humantime_serde",
@@ -37,7 +37,7 @@ impl Default for EngineConfig {
fn default() -> Self { fn default() -> Self {
Self { Self {
flush_metadata_region_interval: DEFAULT_FLUSH_METADATA_REGION_INTERVAL, flush_metadata_region_interval: DEFAULT_FLUSH_METADATA_REGION_INTERVAL,
sparse_primary_key_encoding: true, experimental_sparse_primary_key_encoding: false,
} }
} }
} }
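A sketch of the config shape above: the flush interval round-trips through a human-readable string via `humantime_serde`, and both fields fall back to defaults when absent. This assumes the `serde`, `humantime_serde`, and `toml` crates; the 30-second default is an assumed value, not the real `DEFAULT_FLUSH_METADATA_REGION_INTERVAL`.

use std::time::Duration;
use serde::Deserialize;

#[derive(Debug, Deserialize, PartialEq, Eq)]
struct EngineConfigSketch {
    // Parsed from strings like "30s" or "5m".
    #[serde(with = "humantime_serde", default = "default_flush_interval")]
    flush_metadata_region_interval: Duration,
    #[serde(default)]
    experimental_sparse_primary_key_encoding: bool,
}

fn default_flush_interval() -> Duration {
    Duration::from_secs(30) // assumed default
}

fn main() {
    let cfg: EngineConfigSketch =
        toml::from_str(r#"flush_metadata_region_interval = "30s""#).unwrap();
    assert_eq!(cfg.flush_metadata_region_interval, Duration::from_secs(30));
    assert!(!cfg.experimental_sparse_primary_key_encoding);
}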

View File

@@ -20,7 +20,7 @@ use snafu::ResultExt;
use store_api::metadata::ColumnMetadata; use store_api::metadata::ColumnMetadata;
use store_api::region_engine::RegionEngine; use store_api::region_engine::RegionEngine;
use store_api::region_request::{ use store_api::region_request::{
AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionRequest, AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionPutRequest, RegionRequest,
}; };
use store_api::storage::consts::ReservedColumnId; use store_api::storage::consts::ReservedColumnId;
use store_api::storage::{ConcreteDataType, RegionId}; use store_api::storage::{ConcreteDataType, RegionId};
@@ -183,11 +183,11 @@ impl DataRegion {
pub async fn write_data( pub async fn write_data(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: RegionRequest, request: RegionPutRequest,
) -> Result<AffectedRows> { ) -> Result<AffectedRows> {
let region_id = utils::to_data_region_id(region_id); let region_id = utils::to_data_region_id(region_id);
self.mito self.mito
.handle_request(region_id, request) .handle_request(region_id, RegionRequest::Put(request))
.await .await
.context(MitoWriteOperationSnafu) .context(MitoWriteOperationSnafu)
.map(|result| result.affected_rows) .map(|result| result.affected_rows)

View File

@@ -37,7 +37,7 @@ use common_error::status_code::StatusCode;
use common_runtime::RepeatedTask; use common_runtime::RepeatedTask;
use mito2::engine::MitoEngine; use mito2::engine::MitoEngine;
pub(crate) use options::IndexOptions; pub(crate) use options::IndexOptions;
use snafu::{OptionExt, ResultExt}; use snafu::ResultExt;
pub(crate) use state::MetricEngineState; pub(crate) use state::MetricEngineState;
use store_api::metadata::RegionMetadataRef; use store_api::metadata::RegionMetadataRef;
use store_api::metric_engine_consts::METRIC_ENGINE_NAME; use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
@@ -46,9 +46,7 @@ use store_api::region_engine::{
RegionStatistic, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, RegionStatistic, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
SettableRegionRoleState, SyncManifestResponse, SettableRegionRoleState, SyncManifestResponse,
}; };
use store_api::region_request::{ use store_api::region_request::{BatchRegionDdlRequest, RegionOpenRequest, RegionRequest};
BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber}; use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
use crate::config::EngineConfig; use crate::config::EngineConfig;
@@ -144,17 +142,6 @@ impl RegionEngine for MetricEngine {
.map_err(BoxedError::new) .map_err(BoxedError::new)
} }
async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<BatchResponses, BoxedError> {
self.inner
.handle_batch_catchup_requests(parallelism, requests)
.await
.map_err(BoxedError::new)
}
async fn handle_batch_ddl_requests( async fn handle_batch_ddl_requests(
&self, &self,
batch_request: BatchRegionDdlRequest, batch_request: BatchRegionDdlRequest,
@@ -248,26 +235,19 @@ impl RegionEngine for MetricEngine {
} }
} }
RegionRequest::Truncate(_) => UnsupportedRegionRequestSnafu { request }.fail(), RegionRequest::Truncate(_) => UnsupportedRegionRequestSnafu { request }.fail(),
RegionRequest::Delete(delete) => self.inner.delete_region(region_id, delete).await, RegionRequest::Delete(_) => {
RegionRequest::Catchup(_) => { if self.inner.is_physical_region(region_id) {
let mut response = self self.inner
.inner .mito
.handle_batch_catchup_requests( .handle_request(region_id, request)
1, .await
vec![(region_id, RegionCatchupRequest::default())], .context(error::MitoDeleteOperationSnafu)
) .map(|response| response.affected_rows)
.await } else {
.map_err(BoxedError::new)?; UnsupportedRegionRequestSnafu { request }.fail()
debug_assert_eq!(response.len(), 1); }
let (resp_region_id, response) = response
.pop()
.context(error::UnexpectedRequestSnafu {
reason: "expected 1 response, but got zero responses",
})
.map_err(BoxedError::new)?;
debug_assert_eq!(region_id, resp_region_id);
return response;
} }
RegionRequest::Catchup(req) => self.inner.catchup_region(region_id, req).await,
RegionRequest::BulkInserts(_) => { RegionRequest::BulkInserts(_) => {
// todo(hl): find a way to support bulk inserts in metric engine. // todo(hl): find a way to support bulk inserts in metric engine.
UnsupportedRegionRequestSnafu { request }.fail() UnsupportedRegionRequestSnafu { request }.fail()
@@ -516,17 +496,13 @@ mod test {
use std::collections::HashMap; use std::collections::HashMap;
use common_telemetry::info; use common_telemetry::info;
use common_wal::options::{KafkaWalOptions, WalOptions};
use mito2::sst::location::region_dir_from_table_dir; use mito2::sst::location::region_dir_from_table_dir;
use mito2::test_util::{kafka_log_store_factory, prepare_test_for_kafka_log_store};
use store_api::metric_engine_consts::PHYSICAL_TABLE_METADATA_KEY; use store_api::metric_engine_consts::PHYSICAL_TABLE_METADATA_KEY;
use store_api::mito_engine_options::WAL_OPTIONS_KEY;
use store_api::region_request::{ use store_api::region_request::{
PathType, RegionCloseRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest, PathType, RegionCloseRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest,
}; };
use super::*; use super::*;
use crate::maybe_skip_kafka_log_store_integration_test;
use crate::test_util::TestEnv; use crate::test_util::TestEnv;
#[tokio::test] #[tokio::test]
@@ -707,128 +683,4 @@ mod test {
.unwrap_err(); .unwrap_err();
assert_eq!(err.status_code(), StatusCode::RegionNotFound); assert_eq!(err.status_code(), StatusCode::RegionNotFound);
} }
#[tokio::test]
async fn test_catchup_regions() {
common_telemetry::init_default_ut_logging();
maybe_skip_kafka_log_store_integration_test!();
let kafka_log_store_factory = kafka_log_store_factory().unwrap();
let mito_env = mito2::test_util::TestEnv::new()
.await
.with_log_store_factory(kafka_log_store_factory.clone());
let env = TestEnv::with_mito_env(mito_env).await;
let table_dir = |region_id| format!("table/{region_id}");
let mut physical_region_ids = vec![];
let mut logical_region_ids = vec![];
let num_topics = 3;
let num_physical_regions = 8;
let num_logical_regions = 16;
let parallelism = 2;
let mut topics = Vec::with_capacity(num_topics);
for _ in 0..num_topics {
let topic = prepare_test_for_kafka_log_store(&kafka_log_store_factory)
.await
.unwrap();
topics.push(topic);
}
let topic_idx = |id| (id as usize) % num_topics;
// Creates physical regions
for i in 0..num_physical_regions {
let physical_region_id = RegionId::new(1, i);
physical_region_ids.push(physical_region_id);
let wal_options = WalOptions::Kafka(KafkaWalOptions {
topic: topics[topic_idx(i)].clone(),
});
env.create_physical_region(
physical_region_id,
&table_dir(physical_region_id),
vec![(
WAL_OPTIONS_KEY.to_string(),
serde_json::to_string(&wal_options).unwrap(),
)],
)
.await;
// Creates logical regions for each physical region
for j in 0..num_logical_regions {
let logical_region_id = RegionId::new(1024 + i, j);
logical_region_ids.push(logical_region_id);
env.create_logical_region(physical_region_id, logical_region_id)
.await;
}
}
let metric_engine = env.metric();
// Closes all regions
for region_id in logical_region_ids.iter().chain(physical_region_ids.iter()) {
metric_engine
.handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {}))
.await
.unwrap();
}
// Opens all regions and skip the wal
let requests = physical_region_ids
.iter()
.enumerate()
.map(|(idx, region_id)| {
let mut options = HashMap::new();
let wal_options = WalOptions::Kafka(KafkaWalOptions {
topic: topics[topic_idx(idx as u32)].clone(),
});
options.insert(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new());
options.insert(
WAL_OPTIONS_KEY.to_string(),
serde_json::to_string(&wal_options).unwrap(),
);
(
*region_id,
RegionOpenRequest {
engine: METRIC_ENGINE_NAME.to_string(),
table_dir: table_dir(*region_id),
path_type: PathType::Bare,
options: options.clone(),
skip_wal_replay: true,
checkpoint: None,
},
)
})
.collect::<Vec<_>>();
info!("Open batch regions with parallelism: {parallelism}");
metric_engine
.handle_batch_open_requests(parallelism, requests)
.await
.unwrap();
{
let state = metric_engine.inner.state.read().unwrap();
for logical_region in &logical_region_ids {
assert!(!state.logical_regions().contains_key(logical_region));
}
}
let catch_requests = physical_region_ids
.iter()
.map(|region_id| {
(
*region_id,
RegionCatchupRequest {
set_writable: true,
..Default::default()
},
)
})
.collect::<Vec<_>>();
metric_engine
.handle_batch_catchup_requests(parallelism, catch_requests)
.await
.unwrap();
{
let state = metric_engine.inner.state.read().unwrap();
for logical_region in &logical_region_ids {
assert!(state.logical_regions().contains_key(logical_region));
}
}
}
} }
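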

View File

@@ -324,9 +324,9 @@ mod test {
let physical_region_id2 = RegionId::new(1024, 1); let physical_region_id2 = RegionId::new(1024, 1);
let logical_region_id1 = RegionId::new(1025, 0); let logical_region_id1 = RegionId::new(1025, 0);
let logical_region_id2 = RegionId::new(1025, 1); let logical_region_id2 = RegionId::new(1025, 1);
env.create_physical_region(physical_region_id1, "/test_dir1", vec![]) env.create_physical_region(physical_region_id1, "/test_dir1")
.await; .await;
env.create_physical_region(physical_region_id2, "/test_dir2", vec![]) env.create_physical_region(physical_region_id2, "/test_dir2")
.await; .await;
let region_create_request1 = crate::test_util::create_logical_region_request( let region_create_request1 = crate::test_util::create_logical_region_request(

View File

@@ -12,45 +12,51 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::HashMap; use common_telemetry::debug;
use common_error::ext::BoxedError;
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
use store_api::region_engine::{BatchResponses, RegionEngine}; use store_api::region_engine::RegionEngine;
use store_api::region_request::{RegionCatchupRequest, ReplayCheckpoint}; use store_api::region_request::{
AffectedRows, RegionCatchupRequest, RegionRequest, ReplayCheckpoint,
};
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::engine::MetricEngineInner; use crate::engine::MetricEngineInner;
use crate::error::{BatchCatchupMitoRegionSnafu, PhysicalRegionNotFoundSnafu, Result}; use crate::error::{
MitoCatchupOperationSnafu, PhysicalRegionNotFoundSnafu, Result, UnsupportedRegionRequestSnafu,
};
use crate::utils; use crate::utils;
impl MetricEngineInner { impl MetricEngineInner {
pub async fn handle_batch_catchup_requests( pub async fn catchup_region(
&self, &self,
parallelism: usize, region_id: RegionId,
requests: Vec<(RegionId, RegionCatchupRequest)>, req: RegionCatchupRequest,
) -> Result<BatchResponses> { ) -> Result<AffectedRows> {
let mut all_requests = Vec::with_capacity(requests.len() * 2); if !self.is_physical_region(region_id) {
let mut physical_region_options_list = Vec::with_capacity(requests.len()); return UnsupportedRegionRequestSnafu {
request: RegionRequest::Catchup(req),
}
.fail();
}
let data_region_id = utils::to_data_region_id(region_id);
let physical_region_options = *self
.state
.read()
.unwrap()
.physical_region_states()
.get(&data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: data_region_id,
})?
.options();
for (region_id, req) in requests { let metadata_region_id = utils::to_metadata_region_id(region_id);
let metadata_region_id = utils::to_metadata_region_id(region_id); // TODO(weny): improve the catchup, we can read the wal entries only once.
let data_region_id = utils::to_data_region_id(region_id); debug!("Catchup metadata region {metadata_region_id}");
self.mito
let physical_region_options = *self .handle_request(
.state
.read()
.unwrap()
.physical_region_states()
.get(&data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: data_region_id,
})?
.options();
physical_region_options_list.push((data_region_id, physical_region_options));
all_requests.push((
metadata_region_id, metadata_region_id,
RegionCatchupRequest { RegionRequest::Catchup(RegionCatchupRequest {
set_writable: req.set_writable, set_writable: req.set_writable,
entry_id: req.metadata_entry_id, entry_id: req.metadata_entry_id,
metadata_entry_id: None, metadata_entry_id: None,
@@ -59,11 +65,16 @@ impl MetricEngineInner {
entry_id: c.metadata_entry_id.unwrap_or_default(), entry_id: c.metadata_entry_id.unwrap_or_default(),
metadata_entry_id: None, metadata_entry_id: None,
}), }),
}, }),
)); )
all_requests.push(( .await
.context(MitoCatchupOperationSnafu)?;
debug!("Catchup data region {data_region_id}");
self.mito
.handle_request(
data_region_id, data_region_id,
RegionCatchupRequest { RegionRequest::Catchup(RegionCatchupRequest {
set_writable: req.set_writable, set_writable: req.set_writable,
entry_id: req.entry_id, entry_id: req.entry_id,
metadata_entry_id: None, metadata_entry_id: None,
@@ -72,45 +83,14 @@ impl MetricEngineInner {
entry_id: c.entry_id, entry_id: c.entry_id,
metadata_entry_id: None, metadata_entry_id: None,
}), }),
}, }),
)); )
}
let mut results = self
.mito
.handle_batch_catchup_requests(parallelism, all_requests)
.await .await
.context(BatchCatchupMitoRegionSnafu {})? .context(MitoCatchupOperationSnafu)
.into_iter() .map(|response| response.affected_rows)?;
.collect::<HashMap<_, _>>();
let mut responses = Vec::with_capacity(physical_region_options_list.len()); self.recover_states(region_id, physical_region_options)
for (physical_region_id, physical_region_options) in physical_region_options_list { .await?;
let metadata_region_id = utils::to_metadata_region_id(physical_region_id); Ok(0)
let data_region_id = utils::to_data_region_id(physical_region_id);
let metadata_region_result = results.remove(&metadata_region_id);
let data_region_result = results.remove(&data_region_id);
// Pass the optional `metadata_region_result` and `data_region_result` to
// `recover_physical_region_with_results`. This function handles errors for each
// physical region catchup request, allowing the process to continue with the
// remaining regions even if some requests fail.
let response = self
.recover_physical_region_with_results(
metadata_region_result,
data_region_result,
physical_region_id,
physical_region_options,
// Note: We intentionally don't close the region if recovery fails.
// Closing it here might confuse the region server since it links RegionIds to Engines.
// If recovery didn't succeed, the region should stay open.
false,
)
.await
.map_err(BoxedError::new);
responses.push((physical_region_id, response));
}
Ok(responses)
} }
} }
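The batch variant above fans out two catchup requests per physical region (one for the metadata region, one for the data region) and then pairs the responses back up by region id before recovering state. A toy sketch of that pairing step, with u64 region ids and a made-up id scheme standing in for the real `utils::to_metadata_region_id` / `utils::to_data_region_id` helpers:

use std::collections::HashMap;

type RegionId = u64;

// Toy id scheme for illustration only.
fn to_metadata_region_id(id: RegionId) -> RegionId { id | 1 << 32 }
fn to_data_region_id(id: RegionId) -> RegionId { id }

// Pull the metadata and data responses for each physical region out of the flat result map.
fn pair_results(
    physical_regions: &[RegionId],
    mut results: HashMap<RegionId, Result<u64, String>>,
) -> Vec<(RegionId, Option<Result<u64, String>>, Option<Result<u64, String>>)> {
    physical_regions
        .iter()
        .map(|&id| {
            let metadata = results.remove(&to_metadata_region_id(id));
            let data = results.remove(&to_data_region_id(id));
            (id, metadata, data)
        })
        .collect()
}

fn main() {
    let mut results = HashMap::new();
    results.insert(to_metadata_region_id(7), Ok(0));
    results.insert(to_data_region_id(7), Ok(0));
    let paired = pair_results(&[7], results);
    assert!(paired[0].1.is_some() && paired[0].2.is_some());
}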

View File

@@ -528,7 +528,7 @@ impl MetricEngineInner {
// set data region options // set data region options
set_data_region_options( set_data_region_options(
&mut data_region_request.options, &mut data_region_request.options,
self.config.sparse_primary_key_encoding, self.config.experimental_sparse_primary_key_encoding,
); );
data_region_request data_region_request
@@ -828,9 +828,9 @@ mod test {
let physical_region_id2 = RegionId::new(1024, 1); let physical_region_id2 = RegionId::new(1024, 1);
let logical_region_id1 = RegionId::new(1025, 0); let logical_region_id1 = RegionId::new(1025, 0);
let logical_region_id2 = RegionId::new(1025, 1); let logical_region_id2 = RegionId::new(1025, 1);
env.create_physical_region(physical_region_id1, "/test_dir1", vec![]) env.create_physical_region(physical_region_id1, "/test_dir1")
.await; .await;
env.create_physical_region(physical_region_id2, "/test_dir2", vec![]) env.create_physical_region(physical_region_id2, "/test_dir2")
.await; .await;
let region_create_request1 = let region_create_request1 =

View File

@@ -76,7 +76,7 @@ mod tests {
]; ];
for (phy_region_id, logi_region_ids) in &phy_to_logi { for (phy_region_id, logi_region_ids) in &phy_to_logi {
env.create_physical_region(*phy_region_id, &TestEnv::default_table_dir(), vec![]) env.create_physical_region(*phy_region_id, &TestEnv::default_table_dir())
.await; .await;
for logi_region_id in logi_region_ids { for logi_region_id in logi_region_ids {
env.create_logical_region(*phy_region_id, *logi_region_id) env.create_logical_region(*phy_region_id, *logi_region_id)
@@ -119,7 +119,6 @@ mod tests {
.index_file_path .index_file_path
.map(|path| path.replace(&e.file_id, "<file_id>")); .map(|path| path.replace(&e.file_id, "<file_id>"));
e.file_id = "<file_id>".to_string(); e.file_id = "<file_id>".to_string();
e.index_file_id = e.index_file_id.map(|_| "<index_file_id>".to_string());
format!("\n{:?}", e) format!("\n{:?}", e)
}) })
.sorted() .sorted()
@@ -128,12 +127,12 @@ mod tests {
assert_eq!( assert_eq!(
debug_format, debug_format,
r#" r#"
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
+ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
     );
     // list from storage
     let storage_entries = mito

View File

@@ -47,7 +47,6 @@ impl MetricEngineInner {
         for (region_id, request) in requests {
             if !request.is_physical_table() {
-                warn!("Skipping non-physical table open request: {region_id}");
                 continue;
             }
             let physical_region_options = PhysicalRegionOptions::try_from(&request.options)?;
@@ -73,19 +72,17 @@ impl MetricEngineInner {
             let metadata_region_id = utils::to_metadata_region_id(physical_region_id);
             let data_region_id = utils::to_data_region_id(physical_region_id);
             let metadata_region_result = results.remove(&metadata_region_id);
-            let data_region_result: Option<std::result::Result<RegionResponse, BoxedError>> =
-                results.remove(&data_region_id);
+            let data_region_result = results.remove(&data_region_id);
             // Pass the optional `metadata_region_result` and `data_region_result` to
-            // `recover_physical_region_with_results`. This function handles errors for each
+            // `open_physical_region_with_results`. This function handles errors for each
             // open physical region request, allowing the process to continue with the
             // remaining regions even if some requests fail.
             let response = self
-                .recover_physical_region_with_results(
+                .open_physical_region_with_results(
                     metadata_region_result,
                     data_region_result,
                     physical_region_id,
                     physical_region_options,
-                    true,
                 )
                 .await
                 .map_err(BoxedError::new);
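The comment in this hunk describes the error-handling strategy on both sides of the rename: each physical region's result is pulled out of the shared result map and handled on its own, so a failure in one region does not abort the remaining ones. Below is a minimal, self-contained sketch of that pattern; the `RegionId`, `Response`, and `Error` types are stand-ins invented for illustration and are not the engine's actual types.

use std::collections::HashMap;

type RegionId = u64;

#[derive(Debug)]
struct Response(String);

#[derive(Debug)]
struct Error(String);

/// Takes each region's result out of the shared map and handles it
/// independently, collecting successes and logging failures instead of
/// returning early on the first error.
fn handle_results(
    mut results: HashMap<RegionId, Result<Response, Error>>,
    region_ids: &[RegionId],
) -> Vec<(RegionId, Response)> {
    let mut opened = Vec::new();
    for region_id in region_ids {
        match results.remove(region_id) {
            Some(Ok(response)) => opened.push((*region_id, response)),
            Some(Err(err)) => eprintln!("failed to open region {region_id}: {err:?}"),
            None => eprintln!("no open result for region {region_id}"),
        }
    }
    opened
}

fn main() {
    let mut results = HashMap::new();
    results.insert(1, Ok(Response("ready".to_string())));
    results.insert(2, Err(Error("missing manifest".to_string())));
    // Region 3 never produced a result; the loop simply reports it and moves on.
    let opened = handle_results(results, &[1, 2, 3]);
    println!("opened {} region(s)", opened.len());
}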
@@ -110,13 +107,12 @@ impl MetricEngineInner {
             }
         }
 
-    pub(crate) async fn recover_physical_region_with_results(
+    async fn open_physical_region_with_results(
         &self,
         metadata_region_result: Option<std::result::Result<RegionResponse, BoxedError>>,
         data_region_result: Option<std::result::Result<RegionResponse, BoxedError>>,
         physical_region_id: RegionId,
         physical_region_options: PhysicalRegionOptions,
-        close_region_on_failure: bool,
     ) -> Result<RegionResponse> {
         let metadata_region_id = utils::to_metadata_region_id(physical_region_id);
         let data_region_id = utils::to_data_region_id(physical_region_id);
@@ -140,10 +136,8 @@ impl MetricEngineInner {
             .recover_states(physical_region_id, physical_region_options)
             .await
         {
-            if close_region_on_failure {
-                self.close_physical_region_on_recovery_failure(physical_region_id)
-                    .await;
-            }
+            self.close_physical_region_on_recovery_failure(physical_region_id)
+                .await;
             return Err(err);
         }
         Ok(data_region_response)
@@ -227,7 +221,7 @@ impl MetricEngineInner {
         let mut data_region_options = request.options;
         set_data_region_options(
             &mut data_region_options,
-            self.config.sparse_primary_key_encoding,
+            self.config.experimental_sparse_primary_key_encoding,
         );
         let open_data_region_request = RegionOpenRequest {
             table_dir: request.table_dir.clone(),

View File

@@ -17,12 +17,12 @@
 use std::collections::HashMap;
 
 use store_api::metric_engine_consts::{
-    MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_FALSE_POSITIVE_RATE_OPTION,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_FALSE_POSITIVE_RATE_OPTION_DEFAULT,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION,
     METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION_DEFAULT, METRIC_ENGINE_INDEX_TYPE_OPTION,
 };
+use store_api::mito_engine_options::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING;
 
 use crate::error::{Error, ParseRegionOptionsSnafu, Result};

View File

@@ -16,15 +16,13 @@ use api::v1::{Rows, WriteHint};
 use common_telemetry::{error, info};
 use snafu::{OptionExt, ensure};
 use store_api::codec::PrimaryKeyEncoding;
-use store_api::region_request::{
-    AffectedRows, RegionDeleteRequest, RegionPutRequest, RegionRequest,
-};
+use store_api::region_request::{AffectedRows, RegionPutRequest};
 use store_api::storage::{RegionId, TableId};
 
 use crate::engine::MetricEngineInner;
 use crate::error::{
     ColumnNotFoundSnafu, ForbiddenPhysicalAlterSnafu, LogicalRegionNotFoundSnafu,
-    PhysicalRegionNotFoundSnafu, Result, UnsupportedRegionRequestSnafu,
+    PhysicalRegionNotFoundSnafu, Result,
 };
 use crate::metrics::{FORBIDDEN_OPERATION_COUNT, MITO_OPERATION_ELAPSED};
 use crate::row_modifier::RowsIter;
@@ -52,27 +50,6 @@ impl MetricEngineInner {
         }
     }
 
-    /// Dispatch region delete request
-    pub async fn delete_region(
-        &self,
-        region_id: RegionId,
-        request: RegionDeleteRequest,
-    ) -> Result<AffectedRows> {
-        if self.is_physical_region(region_id) {
-            info!(
-                "Metric region received delete request {request:?} on physical region {region_id:?}"
-            );
-            FORBIDDEN_OPERATION_COUNT.inc();
-
-            UnsupportedRegionRequestSnafu {
-                request: RegionRequest::Delete(request),
-            }
-            .fail()
-        } else {
-            self.delete_logical_region(region_id, request).await
-        }
-    }
-
     async fn put_logical_region(
         &self,
         logical_region_id: RegionId,
@@ -82,13 +59,30 @@ impl MetricEngineInner {
             .with_label_values(&["put"])
             .start_timer();
 
-        let (physical_region_id, data_region_id, primary_key_encoding) =
-            self.find_data_region_meta(logical_region_id)?;
-
-        self.verify_rows(logical_region_id, physical_region_id, &request.rows)
+        let (physical_region_id, data_region_id, primary_key_encoding) = {
+            let state = self.state.read().unwrap();
+            let physical_region_id = *state
+                .logical_regions()
+                .get(&logical_region_id)
+                .with_context(|| LogicalRegionNotFoundSnafu {
+                    region_id: logical_region_id,
+                })?;
+            let data_region_id = to_data_region_id(physical_region_id);
+
+            let primary_key_encoding = state.get_primary_key_encoding(data_region_id).context(
+                PhysicalRegionNotFoundSnafu {
+                    region_id: data_region_id,
+                },
+            )?;
+            (physical_region_id, data_region_id, primary_key_encoding)
+        };
+
+        self.verify_put_request(logical_region_id, physical_region_id, &request)
             .await?;
 
         // write to data region
         // TODO: retrieve table name
         self.modify_rows(
             physical_region_id,
@@ -101,74 +95,19 @@ impl MetricEngineInner {
                 primary_key_encoding: api::v1::PrimaryKeyEncoding::Sparse.into(),
             });
         }
 
-        self.data_region
-            .write_data(data_region_id, RegionRequest::Put(request))
-            .await
+        self.data_region.write_data(data_region_id, request).await
     }
 
-    async fn delete_logical_region(
-        &self,
-        logical_region_id: RegionId,
-        mut request: RegionDeleteRequest,
-    ) -> Result<AffectedRows> {
-        let _timer = MITO_OPERATION_ELAPSED
-            .with_label_values(&["delete"])
-            .start_timer();
-
-        let (physical_region_id, data_region_id, primary_key_encoding) =
-            self.find_data_region_meta(logical_region_id)?;
-
-        self.verify_rows(logical_region_id, physical_region_id, &request.rows)
-            .await?;
-
-        // write to data region
-        // TODO: retrieve table name
-        self.modify_rows(
-            physical_region_id,
-            logical_region_id.table_id(),
-            &mut request.rows,
-            primary_key_encoding,
-        )?;
-        if primary_key_encoding == PrimaryKeyEncoding::Sparse {
-            request.hint = Some(WriteHint {
-                primary_key_encoding: api::v1::PrimaryKeyEncoding::Sparse.into(),
-            });
-        }
-
-        self.data_region
-            .write_data(data_region_id, RegionRequest::Delete(request))
-            .await
-    }
-
-    fn find_data_region_meta(
-        &self,
-        logical_region_id: RegionId,
-    ) -> Result<(RegionId, RegionId, PrimaryKeyEncoding)> {
-        let state = self.state.read().unwrap();
-        let physical_region_id = *state
-            .logical_regions()
-            .get(&logical_region_id)
-            .with_context(|| LogicalRegionNotFoundSnafu {
-                region_id: logical_region_id,
-            })?;
-        let data_region_id = to_data_region_id(physical_region_id);
-
-        let primary_key_encoding = state.get_primary_key_encoding(data_region_id).context(
-            PhysicalRegionNotFoundSnafu {
-                region_id: data_region_id,
-            },
-        )?;
-        Ok((physical_region_id, data_region_id, primary_key_encoding))
-    }
-
-    /// Verifies a request for a logical region against its corresponding metadata region.
+    /// Verifies a put request for a logical region against its corresponding metadata region.
     ///
     /// Includes:
     /// - Check if the logical region exists
     /// - Check if the columns exist
-    async fn verify_rows(
+    async fn verify_put_request(
         &self,
         logical_region_id: RegionId,
         physical_region_id: RegionId,
-        rows: &Rows,
+        request: &RegionPutRequest,
     ) -> Result<()> {
         // Check if the region exists
         let data_region_id = to_data_region_id(physical_region_id);
@@ -189,7 +128,7 @@ impl MetricEngineInner {
                 region_id: data_region_id,
             })?
             .physical_columns();
 
-        for col in &rows.schema {
+        for col in &request.rows.schema {
             ensure!(
                 physical_columns.contains_key(&col.column_name),
                 ColumnNotFoundSnafu {
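The doc comment above states what the verification covers on both sides of the rename: the logical region must be known, and every column referenced by the request must already exist on the physical table. The following is a standalone sketch of that column check, using plain strings in place of the engine's metadata types; the names here are assumptions for illustration only.

use std::collections::HashMap;

#[derive(Debug)]
enum VerifyError {
    ColumnNotFound(String),
}

/// Ensures every column referenced by a request exists in the physical schema.
fn verify_columns(
    physical_columns: &HashMap<String, u32>, // column name -> column id
    request_columns: &[&str],
) -> Result<(), VerifyError> {
    for name in request_columns {
        if !physical_columns.contains_key(*name) {
            return Err(VerifyError::ColumnNotFound(name.to_string()));
        }
    }
    Ok(())
}

fn main() {
    let physical: HashMap<String, u32> =
        [("ts".to_string(), 0), ("host".to_string(), 1), ("cpu".to_string(), 2)].into();
    assert!(verify_columns(&physical, &["ts", "cpu"]).is_ok());
    assert!(verify_columns(&physical, &["ts", "memory"]).is_err());
}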

View File

@@ -50,13 +50,6 @@ pub enum Error {
         location: Location,
     },
 
-    #[snafu(display("Failed to batch catchup mito region"))]
-    BatchCatchupMitoRegion {
-        source: BoxedError,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
     #[snafu(display("No open region result for region {}", region_id))]
     NoOpenRegionResult {
         region_id: RegionId,
@@ -149,6 +142,20 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("Mito delete operation fails"))]
+    MitoDeleteOperation {
+        source: BoxedError,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Mito catchup operation fails"))]
+    MitoCatchupOperation {
+        source: BoxedError,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display("Mito sync operation fails"))]
     MitoSyncOperation {
         source: BoxedError,
@@ -350,10 +357,11 @@ impl ErrorExt for Error {
             | CloseMitoRegion { source, .. }
             | MitoReadOperation { source, .. }
             | MitoWriteOperation { source, .. }
+            | MitoCatchupOperation { source, .. }
             | MitoFlushOperation { source, .. }
+            | MitoDeleteOperation { source, .. }
             | MitoSyncOperation { source, .. }
-            | BatchOpenMitoRegion { source, .. }
-            | BatchCatchupMitoRegion { source, .. } => source.status_code(),
+            | BatchOpenMitoRegion { source, .. } => source.status_code(),
 
             EncodePrimaryKey { source, .. } => source.status_code(),

View File

@@ -536,7 +536,7 @@ impl MetadataRegion {
             .collect();
         let rows = Rows { schema: cols, rows };
 
-        RegionDeleteRequest { rows, hint: None }
+        RegionDeleteRequest { rows }
     }
 
     /// Add logical regions to the metadata region.

View File

@@ -76,17 +76,6 @@ impl TestEnv {
         }
     }
 
-    /// Returns a new env with specific `prefix` and `mito_env` for test.
-    pub async fn with_mito_env(mut mito_env: MitoTestEnv) -> Self {
-        let mito = mito_env.create_engine(MitoConfig::default()).await;
-        let metric = MetricEngine::try_new(mito.clone(), EngineConfig::default()).unwrap();
-        Self {
-            mito_env,
-            mito,
-            metric,
-        }
-    }
-
     pub fn data_home(&self) -> String {
         let env_root = self.mito_env.data_home().to_string_lossy().to_string();
         join_dir(&env_root, "data")
@@ -136,12 +125,7 @@ impl TestEnv {
     }
 
     /// Create regions in [MetricEngine] with specific `physical_region_id`.
-    pub async fn create_physical_region(
-        &self,
-        physical_region_id: RegionId,
-        table_dir: &str,
-        options: Vec<(String, String)>,
-    ) {
+    pub async fn create_physical_region(&self, physical_region_id: RegionId, table_dir: &str) {
         let region_create_request = RegionCreateRequest {
             engine: METRIC_ENGINE_NAME.to_string(),
             column_metadatas: vec![
@@ -167,7 +151,6 @@ impl TestEnv {
             primary_key: vec![],
             options: [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())]
                 .into_iter()
-                .chain(options.into_iter())
                 .collect(),
             table_dir: table_dir.to_string(),
             path_type: PathType::Bare, // Use Bare path type for engine regions
@@ -248,7 +231,7 @@ impl TestEnv {
     /// under [`default_logical_region_id`].
     pub async fn init_metric_region(&self) {
         let physical_region_id = self.default_physical_region_id();
-        self.create_physical_region(physical_region_id, &Self::default_table_dir(), vec![])
+        self.create_physical_region(physical_region_id, &Self::default_table_dir())
             .await;
         let logical_region_id = self.default_logical_region_id();
         self.create_logical_region(physical_region_id, logical_region_id)
@@ -441,22 +424,6 @@ pub fn build_rows(num_tags: usize, num_rows: usize) -> Vec<Row> {
     rows
 }
 
-#[macro_export]
-/// Skip the test if the environment variable `GT_KAFKA_ENDPOINTS` is not set.
-///
-/// The format of the environment variable is:
-/// ```text
-/// GT_KAFKA_ENDPOINTS=localhost:9092,localhost:9093
-/// ```
-macro_rules! maybe_skip_kafka_log_store_integration_test {
-    () => {
-        if std::env::var("GT_KAFKA_ENDPOINTS").is_err() {
-            common_telemetry::warn!("The kafka endpoints is empty, skipping the test");
-            return;
-        }
-    };
-}
-
 #[cfg(test)]
 mod test {
     use object_store::ObjectStore;

View File

@@ -278,41 +278,14 @@ impl SparseReadRowHelper {
         primary_key_encoding: PrimaryKeyEncoding,
     ) -> SparseReadRowHelper {
         if primary_key_encoding == PrimaryKeyEncoding::Sparse {
-            // Optimized case: when schema has exactly 3 columns (primary key, timestamp, and one field),
-            // we can directly use their indices in order without building an explicit mapping.
-            // The column order is: encoded primary key, timestamp, and field.
-            if rows.schema.len() == 3 {
-                let indices = rows
-                    .schema
-                    .iter()
-                    .enumerate()
-                    .map(|(index, _)| Some(index))
-                    .collect();
-                return SparseReadRowHelper {
-                    indices,
-                    num_primary_key_column: 1,
-                };
-            };
-            let mut indices = Vec::with_capacity(rows.schema.len());
-            let name_to_index: HashMap<_, _> = rows
-                .schema
-                .iter()
-                .enumerate()
-                .map(|(index, col)| (&col.column_name, index))
-                .collect();
-            indices.extend(
-                rows.schema[0..2]
-                    .iter()
-                    .enumerate()
-                    .map(|(index, _)| Some(index)),
-            );
-            // Iterate columns and find field columns.
-            for column in metadata.field_columns() {
-                // Get index in request for each field column.
-                let index = name_to_index.get(&column.column_schema.name);
-                indices.push(index.copied());
-            }
+            // We can skip build the indices for sparse primary key encoding.
+            // The order of the columns is encoded primary key, timestamp, field columns.
+            let indices = rows
+                .schema
+                .iter()
+                .enumerate()
+                .map(|(index, _)| Some(index))
+                .collect();
             return SparseReadRowHelper {
                 indices,
                 num_primary_key_column: 1,
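Both versions of this hunk lean on the invariant spelled out in the comments: with sparse primary-key encoding the request schema is laid out as encoded primary key, timestamp, then field columns, so the helper can map request columns to positions either directly or through a name lookup. Below is a small illustrative sketch of the two mapping strategies; the column names are made up and this is not the `SparseReadRowHelper` implementation.

use std::collections::HashMap;

/// Identity mapping: column i in the request is read from position i.
fn identity_indices(num_columns: usize) -> Vec<Option<usize>> {
    (0..num_columns).map(Some).collect()
}

/// Name-based mapping: look up each expected column in the request schema,
/// yielding None for columns the request did not carry.
fn name_based_indices(request_schema: &[&str], expected: &[&str]) -> Vec<Option<usize>> {
    let name_to_index: HashMap<&str, usize> = request_schema
        .iter()
        .enumerate()
        .map(|(index, name)| (*name, index))
        .collect();
    expected
        .iter()
        .map(|name| name_to_index.get(name).copied())
        .collect()
}

fn main() {
    // Sparse layout: encoded primary key, timestamp, then fields.
    let request_schema = ["__primary_key", "ts", "cpu"];
    assert_eq!(identity_indices(request_schema.len()), vec![Some(0), Some(1), Some(2)]);

    let expected = ["__primary_key", "ts", "cpu", "memory"];
    assert_eq!(
        name_based_indices(&request_schema, &expected),
        vec![Some(0), Some(1), Some(2), None]
    );
}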

View File

@@ -477,8 +477,6 @@ fn flat_merge_iterator_bench(c: &mut Criterion) {
                 bulk_part.batch.clone(),
                 context.clone(),
                 None, // No sequence filter
-                1024, // 1024 hosts per part
-                None, // No mem_scan_metrics
             );
             iters.push(Box::new(iter) as _);
         }
@@ -536,13 +534,8 @@ fn bulk_part_record_batch_iter_filter(c: &mut Criterion) {
         );
 
         // Create and iterate over BulkPartRecordBatchIter with filter
-        let iter = BulkPartRecordBatchIter::new(
-            record_batch_with_filter.clone(),
-            context,
-            None, // No sequence filter
-            4096, // 4096 hosts
-            None, // No mem_scan_metrics
-        );
+        let iter =
+            BulkPartRecordBatchIter::new(record_batch_with_filter.clone(), context, None);
 
         // Consume all batches
         for batch_result in iter {
@@ -566,13 +559,7 @@ fn bulk_part_record_batch_iter_filter(c: &mut Criterion) {
         );
 
         // Create and iterate over BulkPartRecordBatchIter
-        let iter = BulkPartRecordBatchIter::new(
-            record_batch_no_filter.clone(),
-            context,
-            None, // No sequence filter
-            4096, // 4096 hosts
-            None, // No mem_scan_metrics
-        );
+        let iter = BulkPartRecordBatchIter::new(record_batch_no_filter.clone(), context, None);
 
         // Consume all batches
         for batch_result in iter {

View File

@@ -20,11 +20,12 @@ use criterion::{Criterion, black_box, criterion_group, criterion_main};
 use datatypes::data_type::ConcreteDataType;
 use datatypes::schema::ColumnSchema;
 use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable;
-use mito2::memtable::{KeyValues, Memtable, MemtableRanges, RangesOptions};
+use mito2::memtable::{KeyValues, Memtable, MemtableRanges};
 use mito2::read;
 use mito2::read::Source;
 use mito2::read::dedup::DedupReader;
 use mito2::read::merge::MergeReaderBuilder;
+use mito2::read::scan_region::PredicateGroup;
 use mito2::region::options::MergeMode;
 use mito2::test_util::column_metadata_to_column_schema;
 use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
@@ -125,7 +126,9 @@ fn create_memtable_with_rows(num_batches: usize) -> SimpleBulkMemtable {
 }
 
 async fn flush(mem: &SimpleBulkMemtable) {
-    let MemtableRanges { ranges, .. } = mem.ranges(None, RangesOptions::for_flush()).unwrap();
+    let MemtableRanges { ranges, .. } = mem
+        .ranges(None, PredicateGroup::default(), None, true)
+        .unwrap();
     let mut source = if ranges.len() == 1 {
         let only_range = ranges.into_values().next().unwrap();
View File

@@ -213,11 +213,7 @@ impl AccessLayer {
     }
 
     /// Deletes a SST file (and its index file if it has one) with given file id.
-    pub(crate) async fn delete_sst(
-        &self,
-        region_file_id: &RegionFileId,
-        index_file_id: &RegionFileId,
-    ) -> Result<()> {
+    pub(crate) async fn delete_sst(&self, region_file_id: &RegionFileId) -> Result<()> {
         let path = location::sst_file_path(&self.table_dir, *region_file_id, self.path_type);
         self.object_store
             .delete(&path)
@@ -226,7 +222,7 @@ impl AccessLayer {
                 file_id: region_file_id.file_id(),
             })?;
 
-        let path = location::index_file_path(&self.table_dir, *index_file_id, self.path_type);
+        let path = location::index_file_path(&self.table_dir, *region_file_id, self.path_type);
         self.object_store
             .delete(&path)
             .await

Some files were not shown because too many files have changed in this diff.