chore: upgrade DataFusion family

Signed-off-by: luofucong <luofc@foxmail.com>
style: remove unused imports (#7567 )
2026-01-14 09:12:57 +00:00 · 2026-01-14 17:05:55 +08:00 · 2026-01-14 07:59:40 +00:00 · 2026-01-14 02:24:36 +00:00 · 2026-01-13 09:17:09 +00:00 · 2026-01-13 04:10:45 +00:00
529 changed files with 30234 additions and 7279 deletions
--- a/.github/actions/setup-greptimedb-cluster/action.yml
+++ b/.github/actions/setup-greptimedb-cluster/action.yml
@@ -70,19 +70,23 @@ runs:
        --wait \
        --wait-for-jobs
  - name: Wait for GreptimeDB
-    shell: bash
-    run: |
-      while true; do
-        PHASE=$(kubectl -n my-greptimedb get gtc my-greptimedb -o jsonpath='{.status.clusterPhase}')
-        if [ "$PHASE" == "Running" ]; then
-          echo "Cluster is ready"
-          break
-        else
-          echo "Cluster is not ready yet: Current phase: $PHASE"
-          kubectl get pods -n my-greptimedb
-          sleep 5 # wait for 5 seconds before check again.
-        fi
-      done
+    uses: nick-fields/retry@v3
+    with:
+      timeout_minutes: 3
+      max_attempts: 1
+      shell: bash
+      command: |
+        while true; do
+          PHASE=$(kubectl -n my-greptimedb get gtc my-greptimedb -o jsonpath='{.status.clusterPhase}')
+          if [ "$PHASE" == "Running" ]; then
+            echo "Cluster is ready"
+            break
+          else
+            echo "Cluster is not ready yet: Current phase: $PHASE"
+            kubectl get pods -n my-greptimedb
+            sleep 5 # wait for 5 seconds before check again.
+          fi
+        done
  - name: Print GreptimeDB info
    if: always()
    shell: bash
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -755,7 +755,7 @@ jobs:
        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait

      - name: Run nextest cases
-        run: cargo nextest run --workspace -F dashboard -F pg_kvbackend -F mysql_kvbackend
+        run: cargo nextest run --workspace -F dashboard -F pg_kvbackend -F mysql_kvbackend -F vector_index
        env:
          CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=mold"
          RUST_BACKTRACE: 1
@@ -813,7 +813,7 @@ jobs:
        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait

      - name: Run nextest cases
-        run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend
+        run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend -F vector_index
        env:
          CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=mold"
          RUST_BACKTRACE: 1
--- a/.gitignore
+++ b/.gitignore
@@ -67,3 +67,6 @@ greptimedb_data

 # Claude code
 CLAUDE.md
+
+# AGENTS.md
+AGENTS.md
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,8 +15,11 @@ repos:
    rev: v1.0
    hooks:
    -    id: fmt
+         args: ["--", "--check"]
+         stages: [commit-msg]
    -    id: clippy
         args: ["--workspace", "--all-targets", "--all-features", "--", "-D", "warnings"]
-         stages: [pre-push]
+         stages: [commit-msg]
    -    id: cargo-check
         args: ["--workspace", "--all-targets", "--all-features"]
+         stages: [commit-msg]
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -102,6 +102,30 @@ like `feat`/`fix`/`docs`, with a concise summary of code change following. AVOID

 All commit messages SHOULD adhere to the [Conventional Commits specification](https://conventionalcommits.org/).

+## AI-Assisted contributions
+
+We have the following policy for AI-assisted PRs:
+
+- The PR author should **understand the core ideas** behind the implementation **end-to-end**, and be able to justify the design and code during review.
+- The PR author should **call out unknowns and assumptions**. It's okay to not fully understand some bits of AI-generated code. You should comment on these cases and point them out to reviewers so that they can use their knowledge of the codebase to clear up any concerns. For example, you might comment "calling this function here seems to work but I'm not familiar with how it works internally, I wonder if there's a race condition if it is called concurrently".
+
+### Why fully AI-generated PRs without understanding are not helpful
+
+Today, AI tools cannot reliably make complex changes to GreptimeDB on their own, which is why we rely on pull requests and code review.
+
+The purposes of code review are:
+
+1. Finish the intended task.
+2. Share knowledge between authors and reviewers, as a long-term investment in the project. For this reason, even if someone familiar with the codebase can finish a task quickly, we're still happy to help a new contributor work on it even if it takes longer.
+
+An AI dump for an issue doesn’t meet these purposes. Maintainers could finish the task faster by using AI directly, and the submitters gain little knowledge if they act only as a pass-through AI proxy without understanding.
+
+Please understand that the reviewing capacity of the project is **very limited**, so large PRs that appear to lack the requisite understanding might not get reviewed, and may eventually be closed or redirected.
+
+### Better ways to contribute than an “AI dump”
+
+It's recommended to write a high-quality issue with a clear problem statement and a minimal, reproducible example. This can make it easier for others to contribute.
+
 ## Getting Help

 There are many ways to get help when you're stuck. It is recommended to ask for help by opening an issue, with a detailed description
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -75,7 +75,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "1.0.0-beta.3"
+version = "1.0.0-beta.4"
 edition = "2024"
 license = "Apache-2.0"

@@ -100,12 +100,13 @@ rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
 # See for more detaiils: https://github.com/rust-lang/cargo/issues/11329
 ahash = { version = "0.8", features = ["compile-time-rng"] }
 aquamarine = "0.6"
-arrow = { version = "56.2", features = ["prettyprint"] }
-arrow-array = { version = "56.2", default-features = false, features = ["chrono-tz"] }
-arrow-buffer = "56.2"
-arrow-flight = "56.2"
-arrow-ipc = { version = "56.2", default-features = false, features = ["lz4", "zstd"] }
-arrow-schema = { version = "56.2", features = ["serde"] }
+arrow = { version = "57.0", features = ["prettyprint"] }
+arrow-array = { version = "57.0", default-features = false, features = ["chrono-tz"] }
+arrow-buffer = "57.0"
+arrow-cast = "57.0"
+arrow-flight = "57.0"
+arrow-ipc = { version = "57.0", default-features = false, features = ["lz4", "zstd"] }
+arrow-schema = { version = "57.0", features = ["serde"] }
 async-stream = "0.3"
 async-trait = "0.1"
 # Remember to update axum-extra, axum-macros when updating axum
@@ -119,38 +120,39 @@ bitflags = "2.4.1"
 bytemuck = "1.12"
 bytes = { version = "1.7", features = ["serde"] }
 chrono = { version = "0.4", features = ["serde"] }
-chrono-tz = { version = "0.10.1", features = ["case-insensitive"] }
+chrono-tz = { version = "0.10", features = ["case-insensitive"] }
 clap = { version = "4.4", features = ["derive"] }
 config = "0.13.0"
 const_format = "0.2"
 crossbeam-utils = "0.8"
 dashmap = "6.1"
-datafusion = "50"
-datafusion-common = "50"
-datafusion-expr = "50"
-datafusion-functions = "50"
-datafusion-functions-aggregate-common = "50"
-datafusion-optimizer = "50"
-datafusion-orc = "0.5"
-datafusion-pg-catalog = "0.12.3"
-datafusion-physical-expr = "50"
-datafusion-physical-plan = "50"
-datafusion-sql = "50"
-datafusion-substrait = "50"
+datafusion = "51.0"
+datafusion-common = "51.0"
+datafusion-datasource = "51.0"
+datafusion-expr = "51.0"
+datafusion-functions = "51.0"
+datafusion-functions-aggregate-common = "51.0"
+datafusion-optimizer = "51.0"
+datafusion-orc = { git = "https://github.com/GreptimeTeam/datafusion-orc.git", rev = "35f2e04bf81f2ab7b6f86c0450d6a77b7098d43e" }
+datafusion-pg-catalog = "0.13"
+datafusion-physical-expr = "51.0"
+datafusion-physical-plan = "51.0"
+datafusion-sql = "51.0"
+datafusion-substrait = "51.0"
 deadpool = "0.12"
 deadpool-postgres = "0.14"
 derive_builder = "0.20"
 derive_more = { version = "2.1", features = ["full"] }
 dotenv = "0.15"
 either = "1.15"
-etcd-client = { version = "0.16.1", features = [
+etcd-client = { version = "0.17", features = [
    "tls",
    "tls-roots",
 ] }
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "173efe5ec62722089db7c531c0b0d470a072b915" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "69499de6d38d9032101fa8a9e10d375e124ca8d3" }
 hex = "0.4"
 http = "1"
 humantime = "2.1"
@@ -161,7 +163,7 @@ itertools = "0.14"
 jsonb = { git = "https://github.com/databendlabs/jsonb.git", rev = "8c8d2fc294a39f3ff08909d60f718639cfba3875", default-features = false }
 lazy_static = "1.4"
 local-ip-address = "0.6"
-loki-proto = { git = "https://github.com/GreptimeTeam/loki-proto.git", rev = "3b7cd33234358b18ece977bf689dc6fb760f29ab" }
+loki-proto = { git = "https://github.com/GreptimeTeam/loki-proto.git", rev = "a73a6b83eeb014645aac527b456816a81bd6b472" }
 meter-core = { git = "https://github.com/GreptimeTeam/greptime-meter.git", rev = "5618e779cf2bb4755b499c630fba4c35e91898cb" }
 mockall = "0.13"
 moka = "0.12"
@@ -171,7 +173,7 @@ notify = "8.0"
 num_cpus = "1.16"
 object_store_opendal = "0.54"
 once_cell = "1.18"
-opentelemetry-proto = { version = "0.30", features = [
+opentelemetry-proto = { version = "0.31", features = [
    "gen-tonic",
    "metrics",
    "trace",
@@ -179,18 +181,18 @@ opentelemetry-proto = { version = "0.30", features = [
    "logs",
 ] }
 ordered-float = { version = "4.3", features = ["serde"] }
-otel-arrow-rust = { git = "https://github.com/GreptimeTeam/otel-arrow", rev = "2d64b7c0fa95642028a8205b36fe9ea0b023ec59", features = [
+otel-arrow-rust = { git = "https://github.com/GreptimeTeam/otel-arrow", rev = "5da284414e9b14f678344b51e5292229e4b5f8d2", features = [
    "server",
 ] }
 parking_lot = "0.12"
-parquet = { version = "56.2", default-features = false, features = ["arrow", "async", "object_store"] }
+parquet = { version = "57.0", default-features = false, features = ["arrow", "async", "object_store"] }
 paste = "1.0"
 pin-project = "1.0"
 pretty_assertions = "1.4.0"
 prometheus = { version = "0.13.3", features = ["process"] }
-promql-parser = { version = "0.6", features = ["ser"] }
-prost = { version = "0.13", features = ["no-recursion-limit"] }
-prost-types = "0.13"
+promql-parser = { version = "0.7.1", features = ["ser"] }
+prost = { version = "0.14", features = ["no-recursion-limit"] }
+prost-types = "0.14"
 raft-engine = { version = "0.4.1", default-features = false }
 rand = "0.9"
 ratelimit = "0.10"
@@ -202,6 +204,7 @@ reqwest = { version = "0.12", default-features = false, features = [
    "stream",
    "multipart",
 ] }
+url = "2.3"
 # Branch: feat/request-timeout
 rskafka = { git = "https://github.com/GreptimeTeam/rskafka.git", rev = "f5688f83e7da591cda3f2674c2408b4c0ed4ed50", features = [
    "transport-tls",
@@ -221,7 +224,7 @@ simd-json = "0.15"
 similar-asserts = "1.6.0"
 smallvec = { version = "1", features = ["serde"] }
 snafu = "0.8"
-sqlparser = { version = "0.58.0", default-features = false, features = ["std", "visitor", "serde"] }
+sqlparser = { version = "0.59.0", default-features = false, features = ["std", "visitor", "serde"] }
 sqlx = { version = "0.8", default-features = false, features = ["any", "macros", "json", "runtime-tokio-rustls"] }
 strum = { version = "0.27", features = ["derive"] }
 sysinfo = "0.33"
@@ -232,7 +235,7 @@ tokio-rustls = { version = "0.26.2", default-features = false }
 tokio-stream = "0.1"
 tokio-util = { version = "0.7", features = ["io-util", "compat"] }
 toml = "0.8.8"
-tonic = { version = "0.13", features = ["tls-ring", "gzip", "zstd"] }
+tonic = { version = "0.14", features = ["tls-ring", "gzip", "zstd"] }
 tower = "0.5"
 tower-http = "0.6"
 tracing = "0.1"
@@ -320,19 +323,20 @@ git = "https://github.com/GreptimeTeam/greptime-meter.git"
 rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"

 [patch.crates-io]
-datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" }                           # branch = "v0.58.x"
+datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-pg-catalog = { git = "https://github.com/GreptimeTeam/datafusion-postgres.git", rev = "74ac8e2806be6de91ff192b97f64735392539d16" }
+datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7143b2fc4492a7970774583ed0997a459f3e5c05" }
+sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "d7d95a44889e099e32d78e9bad9bc00598faef28" }                           # on branch v0.59.x

 [profile.release]
 debug = 1
--- a/9
+++ b/9
@@ -14,6 +14,7 @@ BUILDX_BUILDER_NAME ?= gtbuilder
 BASE_IMAGE ?= ubuntu
 RUST_TOOLCHAIN ?= $(shell cat rust-toolchain.toml | grep channel | cut -d'"' -f2)
 CARGO_REGISTRY_CACHE ?= ${HOME}/.cargo/registry
+CARGO_GIT_CACHE ?= ${HOME}/.cargo/git
 ARCH := $(shell uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/')
 OUTPUT_DIR := $(shell if [ "$(RELEASE)" = "true" ]; then echo "release"; elif [ ! -z "$(CARGO_PROFILE)" ]; then echo "$(CARGO_PROFILE)" ; else echo "debug"; fi)
 SQLNESS_OPTS ?=
@@ -86,7 +87,7 @@ build: ## Build debug version greptime.
 build-by-dev-builder: ## Build greptime by dev-builder.
 	docker run --network=host \
 	${ASSEMBLED_EXTRA_BUILD_ENV} \
-	-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \
+	-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git \
 	-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \
 	make build \
 	CARGO_EXTENSION="${CARGO_EXTENSION}" \
@@ -100,7 +101,7 @@ build-by-dev-builder: ## Build greptime by dev-builder.
 .PHONY: build-android-bin
 build-android-bin: ## Build greptime binary for android.
 	docker run --network=host \
-	-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \
+	-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git \
 	-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-android:${DEV_BUILDER_IMAGE_TAG} \
 	make build \
 	CARGO_EXTENSION="ndk --platform 23 -t aarch64-linux-android" \
@@ -206,7 +207,7 @@ fix-udeps: ## Remove unused dependencies automatically.
 	@cargo udeps --workspace --all-targets --output json > udeps-report.json || true
 	@echo "Removing unused dependencies..."
 	@python3 scripts/fix-udeps.py udeps-report.json
-	
+
 .PHONY: fmt-check
 fmt-check: ## Check code format.
 	cargo fmt --all -- --check
@@ -224,7 +225,7 @@ stop-etcd: ## Stop single node etcd for testing purpose.
 .PHONY: run-it-in-container
 run-it-in-container: start-etcd ## Run integration tests in dev-builder.
 	docker run --network=host \
-	-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v /tmp:/tmp \
+	-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git -v /tmp:/tmp \
 	-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \
 	make test sqlness-test BUILD_JOBS=${BUILD_JOBS}

--- a/cliff.toml
+++ b/cliff.toml
@@ -17,7 +17,7 @@ Release date: {{ timestamp | date(format="%B %d, %Y") }}
 {%- set breakings = commits | filter(attribute="breaking", value=true) -%}
 {%- if breakings | length > 0 %}

-## Breaking changes
+### Breaking changes
    {% for commit in breakings %}
      * {{ commit.github.pr_title }}\
        {% if commit.github.username %} by \
--- a/config/config.md
+++ b/config/config.md
@@ -14,11 +14,12 @@
 | --- | -----| ------- | ----------- |
 | `default_timezone` | String | Unset | The default timezone of the server. |
 | `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
+| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
+| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
 | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
 | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
 | `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
 | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
-| `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. |
 | `runtime` | -- | -- | The runtime options. |
 | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
 | `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
@@ -26,14 +27,12 @@
 | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
 | `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
 | `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
-| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
 | `http.enable_cors` | Bool | `true` | HTTP CORS support, it's turned on by default<br/>This allows browser to access http APIs without CORS restrictions |
 | `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
 | `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not valid strings. |
 | `grpc` | -- | -- | The gRPC server options. |
 | `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
 | `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
-| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
 | `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
 | `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
 | `grpc.tls.mode` | String | `disable` | TLS mode. |
@@ -227,7 +226,8 @@
 | --- | -----| ------- | ----------- |
 | `default_timezone` | String | Unset | The default timezone of the server. |
 | `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
-| `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. |
+| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
+| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
 | `runtime` | -- | -- | The runtime options. |
 | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
 | `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
@@ -238,7 +238,6 @@
 | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
 | `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
 | `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
-| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
 | `http.enable_cors` | Bool | `true` | HTTP CORS support, it's turned on by default<br/>This allows browser to access http APIs without CORS restrictions |
 | `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
 | `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not valid strings. |
@@ -246,7 +245,6 @@
 | `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
 | `grpc.server_addr` | String | `127.0.0.1:4001` | The address advertised to the metasrv, and used for connections from outside the host.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `grpc.bind_addr`. |
 | `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
-| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
 | `grpc.flight_compression` | String | `arrow_ipc` | Compression mode for frontend side Arrow IPC service. Available options:<br/>- `none`: disable all compression<br/>- `transport`: only enable gRPC transport compression (zstd)<br/>- `arrow_ipc`: only enable Arrow IPC compression (lz4)<br/>- `all`: enable all compression.<br/>Default to `none` |
 | `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
 | `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
@@ -346,10 +344,10 @@
 | `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. |
 | `backend` | String | `etcd_store` | The datastore for meta server.<br/>Available values:<br/>- `etcd_store` (default value)<br/>- `memory_store`<br/>- `postgres_store`<br/>- `mysql_store` |
 | `meta_table_name` | String | `greptime_metakv` | Table name in RDS to store metadata. Effect when using a RDS kvbackend.<br/>**Only used when backend is `postgres_store`.** |
-| `meta_schema_name` | String | `greptime_schema` | Optional PostgreSQL schema for metadata table and election table name qualification.<br/>When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),<br/>set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.<br/>GreptimeDB will NOT create the schema automatically; please ensure it exists or the user has permission.<br/>**Only used when backend is `postgres_store`.** |
+| `meta_schema_name` | String | `greptime_schema` | Optional PostgreSQL schema for metadata table and election table name qualification.<br/>When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),<br/>set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.<br/>**Only used when backend is `postgres_store`.** |
+| `auto_create_schema` | Bool | `true` | Automatically create PostgreSQL schema if it doesn't exist.<br/>When enabled, the system will execute `CREATE SCHEMA IF NOT EXISTS <schema_name>`<br/>before creating metadata tables. This is useful in production environments where<br/>manual schema creation may be restricted.<br/>Default is true.<br/>Note: The PostgreSQL user must have CREATE SCHEMA permission for this to work.<br/>**Only used when backend is `postgres_store`.** |
 | `meta_election_lock_id` | Integer | `1` | Advisory lock id in PostgreSQL for election. Effect when using PostgreSQL as kvbackend<br/>Only used when backend is `postgres_store`. |
 | `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
-| `use_memory_store` | Bool | `false` | Store data in memory. |
 | `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running on cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). |
 | `region_failure_detector_initialization_delay` | String | `10m` | The delay before starting region failure detection.<br/>This delay helps prevent Metasrv from triggering unnecessary region failovers before all Datanodes are fully started.<br/>Especially useful when the cluster is not deployed with GreptimeDB Operator and maintenance mode is not enabled. |
 | `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL.<br/>**This option is not recommended to be set to true, because it may lead to data loss during failover.** |
--- a/config/frontend.example.toml
+++ b/config/frontend.example.toml
@@ -6,9 +6,15 @@ default_timezone = "UTC"
 ## @toml2docs:none-default
 default_column_prefix = "greptime"

-## The maximum in-flight write bytes.
+## Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
+## Set to 0 to disable the limit. Default: "0" (unlimited)
 ## @toml2docs:none-default
-#+ max_in_flight_write_bytes = "500MB"
+#+ max_in_flight_write_bytes = "1GB"
+
+## Policy when write bytes quota is exhausted.
+## Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail"
+## @toml2docs:none-default
+#+ write_bytes_exhausted_policy = "wait"

 ## The runtime options.
 #+ [runtime]
@@ -35,10 +41,6 @@ timeout = "0s"
 ## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
 ## Set to 0 to disable limit.
 body_limit = "64MB"
-## Maximum total memory for all concurrent HTTP request bodies.
-## Set to 0 to disable the limit. Default: "0" (unlimited)
-## @toml2docs:none-default
-#+ max_total_body_memory = "1GB"
 ## HTTP CORS support, it's turned on by default
 ## This allows browser to access http APIs without CORS restrictions
 enable_cors = true
@@ -62,10 +64,6 @@ bind_addr = "127.0.0.1:4001"
 server_addr = "127.0.0.1:4001"
 ## The number of server worker threads.
 runtime_size = 8
-## Maximum total memory for all concurrent gRPC request messages.
-## Set to 0 to disable the limit. Default: "0" (unlimited)
-## @toml2docs:none-default
-#+ max_total_message_memory = "1GB"
 ## Compression mode for frontend side Arrow IPC service. Available options:
 ## - `none`: disable all compression
 ## - `transport`: only enable gRPC transport compression (zstd)
--- a/config/metasrv.example.toml
+++ b/config/metasrv.example.toml
@@ -34,11 +34,18 @@ meta_table_name = "greptime_metakv"
 ## Optional PostgreSQL schema for metadata table and election table name qualification.
 ## When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),
 ## set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.
-## GreptimeDB will NOT create the schema automatically; please ensure it exists or the user has permission.
 ## **Only used when backend is `postgres_store`.**
-
 meta_schema_name = "greptime_schema"

+## Automatically create PostgreSQL schema if it doesn't exist.
+## When enabled, the system will execute `CREATE SCHEMA IF NOT EXISTS <schema_name>`
+## before creating metadata tables. This is useful in production environments where
+## manual schema creation may be restricted.
+## Default is true.
+## Note: The PostgreSQL user must have CREATE SCHEMA permission for this to work.
+## **Only used when backend is `postgres_store`.**
+auto_create_schema = true
+
 ## Advisory lock id in PostgreSQL for election. Effect when using PostgreSQL as kvbackend
 ## Only used when backend is `postgres_store`.
 meta_election_lock_id = 1
@@ -50,9 +57,6 @@ meta_election_lock_id = 1
 ## For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector".
 selector = "round_robin"

-## Store data in memory.
-use_memory_store = false
-
 ## Whether to enable region failover.
 ## This feature is only available on GreptimeDB running on cluster mode and
 ## - Using Remote WAL
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -6,6 +6,16 @@ default_timezone = "UTC"
 ## @toml2docs:none-default
 default_column_prefix = "greptime"

+## Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
+## Set to 0 to disable the limit. Default: "0" (unlimited)
+## @toml2docs:none-default
+#+ max_in_flight_write_bytes = "1GB"
+
+## Policy when write bytes quota is exhausted.
+## Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail"
+## @toml2docs:none-default
+#+ write_bytes_exhausted_policy = "wait"
+
 ## Initialize all regions in the background during the startup.
 ## By default, it provides services after all regions have been initialized.
 init_regions_in_background = false
@@ -22,10 +32,6 @@ max_concurrent_queries = 0
 ## Enable telemetry to collect anonymous usage data. Enabled by default.
 #+ enable_telemetry = true

-## The maximum in-flight write bytes.
-## @toml2docs:none-default
-#+ max_in_flight_write_bytes = "500MB"
-
 ## The runtime options.
 #+ [runtime]
 ## The number of threads to execute the runtime for global read operations.
@@ -43,10 +49,6 @@ timeout = "0s"
 ## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
 ## Set to 0 to disable limit.
 body_limit = "64MB"
-## Maximum total memory for all concurrent HTTP request bodies.
-## Set to 0 to disable the limit. Default: "0" (unlimited)
-## @toml2docs:none-default
-#+ max_total_body_memory = "1GB"
 ## HTTP CORS support, it's turned on by default
 ## This allows browser to access http APIs without CORS restrictions
 enable_cors = true
@@ -67,10 +69,6 @@ prom_validation_mode = "strict"
 bind_addr = "127.0.0.1:4001"
 ## The number of server worker threads.
 runtime_size = 8
-## Maximum total memory for all concurrent gRPC request messages.
-## Set to 0 to disable the limit. Default: "0" (unlimited)
-## @toml2docs:none-default
-#+ max_total_message_memory = "1GB"
 ## The maximum connection age for gRPC connection.
 ## The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.
 ## Refer to https://grpc.io/docs/guides/keepalive/ for more details.
--- a/cyborg/bin/bump-versions.ts
+++ b/cyborg/bin/bump-versions.ts
@@ -57,6 +57,20 @@ const REPO_CONFIGS: Record<string, RepoConfig> = {
        return ['bump-nightly-version.yml', version];
      }

+      // Check for prerelease versions (e.g., 1.0.0-beta.3, 1.0.0-rc.1)
+      const prereleaseMatch = version.match(/^(\d+)\.(\d+)\.(\d+)-(beta|rc)\.(\d+)$/);
+      if (prereleaseMatch) {
+        const [, major, minor, patch, prereleaseType, prereleaseNum] = prereleaseMatch;
+
+        // If it's beta.1 and patch version is 0, treat as major version
+        if (prereleaseType === 'beta' && prereleaseNum === '1' && patch === '0') {
+          return ['bump-version.yml', `${major}.${minor}`];
+        }
+
+        // Otherwise (beta.1 with a non-zero patch, beta.x where x > 1, or rc.x), treat as patch version
+        return ['bump-patch-version.yml', version];
+      }
+
      const parts = version.split('.');
      if (parts.length !== 3) {
        throw new Error('Invalid version format');
--- a/docs/rfcs/2025-12-05-vector-index.md
+++ b/docs/rfcs/2025-12-05-vector-index.md
@@ -0,0 +1,94 @@
+---
+Feature Name: Vector Index
+Tracking Issue: TBD
+Date: 2025-12-04
+Author: "TBD"
+---
+
+# Summary
+Introduce a per-SST approximate nearest neighbor (ANN) index for `VECTOR(dim)` columns with a pluggable engine. USearch HNSW is the initial engine; VSAG (the default once its binding is linked) and future engines remain selectable at DDL or alter time, and the chosen engine is encoded in the index metadata. The index is built alongside SST creation and accelerates `ORDER BY vec_*_distance(column, <literal vector>) LIMIT k` queries, falling back to the existing brute-force path when an index is unavailable or ineligible.
+
+# Motivation
+Vector distances are currently computed with nalgebra across all rows (O(N)) before sorting, which does not scale to millions of vectors. An on-disk ANN index with sub-linear search reduces latency and compute cost for common RAG, semantic search, and recommendation workloads without changing SQL.
+
+# Details
+
+## Current Behavior
+`VECTOR(dim)` values are stored as binary blobs. Queries call `vec_cos_distance`/`vec_l2sq_distance`/`vec_dot_product` via nalgebra for every row and then sort; there is no indexing or caching.
+
+## Index Eligibility and Configuration
+Only `VECTOR(dim)` columns can be indexed. A column metadata flag follows the existing column-option pattern with an intentionally small surface area:
+- `engine`: `vsag` (default when the binding is built) or `usearch`. If a configured engine is unavailable at runtime, the builder logs and falls back to `usearch` while leaving the option intact for future rebuilds.
+- `metric`: `cosine` (default), `l2sq`, or `dot`; mismatches with query functions force brute-force execution.
+- `m`: HNSW graph connectivity (higher = denser graph, more memory, better recall), default `16`.
+- `ef_construct`: build-time expansion, default `128`.
+- `ef_search`: query-time expansion, default `64`; engines may clamp values.
+
+Option semantics mirror HNSW defaults so both USearch and VSAG can honor them; engine-specific tunables stay in reserved key-value pairs inside the blob header for forward compatibility.
+
+DDL reuses column extensions similar to inverted/fulltext indexes:
+
+```sql
+CREATE TABLE embeddings (
+  ts TIMESTAMP TIME INDEX,
+  id STRING PRIMARY KEY,
+  vec VECTOR(384) VECTOR INDEX WITH (engine = 'vsag', metric = 'cosine', ef_search = 64)
+);
+```
+
+Altering column options toggles the flag, can switch engines (for example `usearch` -> `vsag`), and triggers rebuilds through the existing alter/compaction flow. Engine choice stays in table metadata and each blob header; new SSTs use the configured engine while older SSTs remain readable under their recorded engine until compaction or a manual rebuild rewrites them.
+
+## Storage and Format
+- One vector index per indexed column per SST, stored as a Puffin blob with type `greptime-vector-index-v1`.
+- Each blob records the engine (`usearch`, `vsag`, future values) and engine parameters in the header so readers can select the matching decoder. Mixed-engine SSTs remain readable because the engine id travels with the blob.
+- USearch uses `f32` vectors and SST row offsets (`u64`) as keys; nulls and `OpType::Delete` rows are skipped. Row ids are the absolute SST ordinal so readers can derive `RowSelection` directly from parquet row group lengths without extra side tables.
+- Blob layout:
+  - Header: version, column id, dimension, engine id, metric, `m`, `ef_construct`, `ef_search`, and reserved engine-specific key-value pairs.
+  - Counts: total rows written and indexed rows.
+  - Payload: USearch binary produced by `save_to_buffer`.
+- An empty index (no eligible vectors) results in no available index entry for that column.
+- `puffin_manager` registers the blob type so caches and readers discover it alongside inverted/fulltext/bloom blobs in the same index file.
+
+## Row Visibility and Duplicates
+- The indexer increments `row_offset` for every incoming row (including skipped/null/delete rows) so offsets stay aligned with parquet ordering across row groups.
+- Only `OpType::Put` rows with the expected dimension are inserted; `OpType::Delete` and malformed rows are skipped but still advance `row_offset`, matching the data plane’s visibility rules.
+- Multiple versions of the same primary key remain in the graph; the read path intersects search hits with the standard mito2 deduplication/visibility pipeline (sequence-aware dedup, delete filtering, projection) before returning results.
+- Searches overfetch beyond `k` to compensate for rows discarded by visibility checks and to avoid reissuing index reads.
+
+## Build Path (mito2 write)
+Extend `sst::index::Indexer` to optionally create a `VectorIndexer` when region metadata marks a column as vector-indexed, mirroring how inverted/fulltext/bloom filters attach to `IndexerBuilderImpl` in `mito2`.
+
+The indexer consumes `Batch`/`RecordBatch` data and shares memory tracking and abort semantics with existing indexers:
+- Maintain a running `row_offset` that follows SST write order and spans row groups so the search result can be turned into `RowSelection`.
+- For each `OpType::Put`, if the vector is non-null and matches the declared dimension, insert into USearch with `row_offset` as the key; otherwise skip.
+- Track memory with existing index build metrics; on failure, abort only the vector index while keeping SST writing unaffected.
+
+Engine selection is table-driven: the builder picks the configured engine (default `vsag`, fallback `usearch` if `vsag` is not compiled in) and dispatches to the matching implementation. Unknown engines skip index build with a warning.
+
+On `finish`, serialize the engine-tagged index into the Puffin writer and record `IndexType::Vector` metadata for the column. `IndexOutput` and `FileMeta::indexes/available_indexes` gain a vector entry so manifest updates and `RegionVersion` surface per-column presence, following patterns used by inverted/fulltext/bloom indexes. Planner/metadata validation ensures that mismatched dimensions only reduce the indexed-row count and do not break reads.
+
+## Read Path (mito2 query)
+A planner rule in `query` identifies eligible plans on mito2 tables: a single `ORDER BY vec_cos_distance|vec_l2sq_distance|vec_dot_product(<vector column>, <literal vector>)` in ascending order plus a `LIMIT`/`TopK`. The rule rejects plans with multiple sort keys, non-literal query vectors, or additional projections that would change the distance expression and falls back to brute-force in those cases.
+
+For eligible scans, build a `VectorIndexScan` execution node that:
+- Consults SST metadata for `IndexType::Vector`, loads the index via Puffin using the existing `mito2::cache::index` infrastructure, and dispatches to the engine declared in the blob header (USearch/VSAG/etc.).
+- Runs the engine’s `search` with an overfetch (for example 2×k) to tolerate rows filtered by deletes, dimension mismatches, or late-stage dedup; keys already match SST row offsets produced by the writer.
+- Converts hits to `RowSelection` using parquet row group lengths and reuses the parquet reader so visibility, projection, and deduplication logic stay unchanged; distances are recomputed with the distance function used by the query (`vec_cos_distance`, `vec_l2sq_distance`, or `vec_dot_product`) before the final trim to k to guarantee ordering and to merge distributed partial results deterministically.
+
+Any unsupported shape, load error, or cache miss falls back to the current brute-force execution path.
+
+## Lifecycle and Maintenance
+Lifecycle piggybacks on the existing SST/index flow: rebuilds run where other secondary indexes do, graphs are always rebuilt from source rows (no HNSW merge), and cleanup/versioning/caching reuse the existing Puffin and index cache paths.
+
+# Implementation Plan
+1. Add the `usearch` dependency (wrapper module in `index` or `mito2`) and map minimal HNSW options; keep an engine trait that allows plugging VSAG without changing the rest of the pipeline.
+2. Introduce `IndexType::Vector` and a column metadata key for vector index options (including `engine`); add SQL parser and `SHOW CREATE TABLE` support for `VECTOR INDEX WITH (...)`.
+3. Implement `vector_index` build/read modules under `mito2` (and `index` if shared), including Puffin serialization that records engine id, blob-type registration with `puffin_manager`, and integration with the `Indexer` builder, `IndexOutput`, manifest updates, and compaction rebuild.
+4. Extend the query planner/execution to detect eligible plans and drive a `RowSelection`-based ANN scan with a fallback path, dispatching by engine at read time and using existing Puffin and index caches.
+5. Add unit tests for serialization/search correctness and an end-to-end test covering plan rewrite, cache usage, engine selection, and fallback; add a mixed-engine test to confirm old USearch blobs still serve after a VSAG switch.
+6. Follow up with an optional VSAG engine binding (feature flag), validate parity with USearch on dense vectors, exercise alternative algorithms (for example PQ), and flip the default `engine` to `vsag` when the binding is present.
+
+# Alternatives
+- **VSAG (follow-up engine):** C++ library with HNSW and additional algorithms (for example SINDI for sparse vectors and PQ) targeting in-memory and disk-friendly search. Provides parameter generators and a roadmap for GPU-assisted build and graph compression. Compared to FAISS it is newer with fewer integrations but bundles sparse/dense coverage and out-of-core focus in one engine. Fits the pluggable-engine design and would become the default `engine = 'vsag'` when linked; USearch remains available for lighter dependencies.
+- **FAISS:** Broad feature set (IVF/IVFPQ/PQ/HNSW, GPU acceleration, scalar filtering, pre/post filters) and battle-tested performance across datasets, but it requires a heavier C++/GPU toolchain, has no official Rust binding, and is less disk-centric than VSAG; integrating it would add more build/distribution burden than USearch/VSAG.
+- **Do nothing:** Keep brute-force evaluation, which remains O(N) and unacceptable at scale.
--- a/grafana/dashboards/metrics/cluster/dashboard.json
+++ b/grafana/dashboards/metrics/cluster/dashboard.json
@@ -8863,7 +8863,7 @@
        "type": "prometheus",
        "uid": "${metrics}"
      },
-      "description": "Elapsed of Reconciliation steps ",
+      "description": "Elapsed of Reconciliation steps",
      "fieldConfig": {
        "defaults": {
          "color": {
@@ -9366,7 +9366,7 @@
              "editorMode": "code",
              "expr": "greptime_flow_input_buf_size",
              "instant": false,
-              "legendFormat": "[{{instance}}]-[{{pod}]",
+              "legendFormat": "[{{instance}}]-[{{pod}}]",
              "range": true,
              "refId": "A"
            }
@@ -9472,6 +9472,755 @@
      ],
      "title": "Flownode",
      "type": "row"
+    },
+    {
+      "collapsed": true,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 187
+      },
+      "id": 357,
+      "panels": [],
+      "title": "Trigger",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Total number of triggers currently defined.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 188
+      },
+      "id": 358,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "greptime_trigger_count{}",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Trigger Count",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Elapsed time for trigger evaluation, including query execution and condition evaluation.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 196
+      },
+      "id": 359,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-p99",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.75, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-p75",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Trigger Eval Elapsed",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Rate of failed trigger evaluations.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 196
+      },
+      "id": 360,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "rate(greptime_trigger_evaluate_failure_count[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Trigger Eval Failure Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Elapsed time to send trigger alerts to notification channels.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 204
+      },
+      "id": 361,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.75, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Send Alert Elapsed",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Rate of failures when sending trigger alerts.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 204
+      },
+      "id": 364,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "rate(greptime_trigger_send_alert_failure_count[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Send Alert Failure Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Elapsed time to persist trigger alert records.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 212
+      },
+      "id": 363,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.75, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Save Alert Elapsed",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Rate of failures when persisting trigger alert records.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 212
+      },
+      "id": 362,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Save Alert Failure Rate",
+      "type": "timeseries"
    }
  ],
  "preload": false,
@@ -9613,4 +10362,4 @@
  "title": "GreptimeDB",
  "uid": "dejf3k5e7g2kgb",
  "version": 15
-}
+}
--- a/grafana/dashboards/metrics/cluster/dashboard.md
+++ b/grafana/dashboards/metrics/cluster/dashboard.md
@@ -111,12 +111,34 @@
 | Rate of meta KV Ops | `rate(greptime_meta_kv_request_elapsed_count[$__rate_interval])` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `none` | `{{pod}}-{{op}} p99` |
 | DDL Latency | `histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_tables_bucket))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_table))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_view))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_flow))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_drop_table))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_alter_table))` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `s` | `CreateLogicalTables-{{step}} p90` |
 | Reconciliation stats | `greptime_meta_reconciliation_stats` | `timeseries` | Reconciliation stats | `prometheus` | `s` | `{{pod}}-{{table_type}}-{{type}}` |
-| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps  | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` |
+| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` |
 # Flownode
 | Title | Query | Type | Description | Datasource | Unit | Legend Format |
 | --- | --- | --- | --- | --- | --- | --- |
 | Flow Ingest / Output Rate | `sum by(instance, pod, direction) (rate(greptime_flow_processed_rows[$__rate_interval]))` | `timeseries` | Flow Ingest / Output Rate. | `prometheus` | -- | `[{{pod}}]-[{{instance}}]-[{{direction}}]` |
 | Flow Ingest Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))`<br/>`histogram_quantile(0.99, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))` | `timeseries` | Flow Ingest Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-p95` |
 | Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`<br/>`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` |
-| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}]` |
+| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` |
 | Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` |
+# Trigger
+| Title | Query | Type | Description | Datasource | Unit | Legend Format |
+| --- | --- | --- | --- | --- | --- | --- |
+| Trigger Count | `greptime_trigger_count{}` | `timeseries` | Total number of triggers currently defined. | `prometheus` | -- | `__auto` |
+| Trigger Eval Elapsed | `histogram_quantile(0.99, 
+  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])
+)`<br/>`histogram_quantile(0.75, 
+  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])
+)` | `timeseries` | Elapsed time for trigger evaluation, including query execution and condition evaluation. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-p99` |
+| Trigger Eval Failure Rate | `rate(greptime_trigger_evaluate_failure_count[$__rate_interval])` | `timeseries` | Rate of failed trigger evaluations. | `prometheus` | `none` | `__auto` |
+| Send Alert Elapsed | `histogram_quantile(0.99, 
+  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])
+)`<br/>`histogram_quantile(0.75, 
+  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])
+)` | `timeseries` | Elapsed time to send trigger alerts to notification channels. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99` |
+| Send Alert Failure Rate | `rate(greptime_trigger_send_alert_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when sending trigger alerts. | `prometheus` | `none` | `__auto` |
+| Save Alert Elapsed | `histogram_quantile(0.99, 
+  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])
+)`<br/>`histogram_quantile(0.75, 
+  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])
+)` | `timeseries` | Elapsed time to persist trigger alert records. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99` |
+| Save Alert Failure Rate | `rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when persisting trigger alert records. | `prometheus` | `none` | `__auto` |
--- a/grafana/dashboards/metrics/cluster/dashboard.yaml
+++ b/grafana/dashboards/metrics/cluster/dashboard.yaml
@@ -1002,7 +1002,7 @@ groups:
              legendFormat: '{{pod}}-{{table_type}}-{{type}}'
        - title: Reconciliation steps
          type: timeseries
-          description: 'Elapsed of Reconciliation steps '
+          description: Elapsed of Reconciliation steps
          unit: s
          queries:
            - expr: histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)
@@ -1057,7 +1057,7 @@ groups:
              datasource:
                type: prometheus
                uid: ${metrics}
-              legendFormat: '[{{instance}}]-[{{pod}]'
+              legendFormat: '[{{instance}}]-[{{pod}}]'
        - title: Flow Processing Error per Instance
          type: timeseries
          description: Flow Processing Error per Instance.
@@ -1067,3 +1067,89 @@ groups:
                type: prometheus
                uid: ${metrics}
              legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]'
+    - title: Trigger
+      panels:
+        - title: Trigger Count
+          type: timeseries
+          description: Total number of triggers currently defined.
+          queries:
+            - expr: greptime_trigger_count{}
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
+        - title: Trigger Eval Elapsed
+          type: timeseries
+          description: Elapsed time for trigger evaluation, including query execution and condition evaluation.
+          unit: s
+          queries:
+            - expr: "histogram_quantile(0.99, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-p99'
+            - expr: "histogram_quantile(0.75, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-p75'
+        - title: Trigger Eval Failure Rate
+          type: timeseries
+          description: Rate of failed trigger evaluations.
+          unit: none
+          queries:
+            - expr: rate(greptime_trigger_evaluate_failure_count[$__rate_interval])
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
+        - title: Send Alert Elapsed
+          type: timeseries
+          description: Elapsed time to send trigger alerts to notification channels.
+          unit: s
+          queries:
+            - expr: "histogram_quantile(0.99, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99'
+            - expr: "histogram_quantile(0.75, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75'
+        - title: Send Alert Failure Rate
+          type: timeseries
+          description: Rate of failures when sending trigger alerts.
+          unit: none
+          queries:
+            - expr: rate(greptime_trigger_send_alert_failure_count[$__rate_interval])
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
+        - title: Save Alert Elapsed
+          type: timeseries
+          description: Elapsed time to persist trigger alert records.
+          unit: s
+          queries:
+            - expr: "histogram_quantile(0.99, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99'
+            - expr: "histogram_quantile(0.75, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75'
+        - title: Save Alert Failure Rate
+          type: timeseries
+          description: Rate of failures when persisting trigger alert records.
+          unit: none
+          queries:
+            - expr: rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
--- a/grafana/dashboards/metrics/standalone/dashboard.json
+++ b/grafana/dashboards/metrics/standalone/dashboard.json
@@ -8863,7 +8863,7 @@
        "type": "prometheus",
        "uid": "${metrics}"
      },
-      "description": "Elapsed of Reconciliation steps ",
+      "description": "Elapsed of Reconciliation steps",
      "fieldConfig": {
        "defaults": {
          "color": {
@@ -9366,7 +9366,7 @@
              "editorMode": "code",
              "expr": "greptime_flow_input_buf_size",
              "instant": false,
-              "legendFormat": "[{{instance}}]-[{{pod}]",
+              "legendFormat": "[{{instance}}]-[{{pod}}]",
              "range": true,
              "refId": "A"
            }
@@ -9472,6 +9472,755 @@
      ],
      "title": "Flownode",
      "type": "row"
+    },
+    {
+      "collapsed": true,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 187
+      },
+      "id": 357,
+      "panels": [],
+      "title": "Trigger",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Total number of triggers currently defined.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 188
+      },
+      "id": 358,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "greptime_trigger_count{}",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Trigger Count",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Elapsed time for trigger evaluation, including query execution and condition evaluation.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 196
+      },
+      "id": 359,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-p99",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.75, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-p75",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Trigger Eval Elapsed",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Rate of failed trigger evaluations.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 196
+      },
+      "id": 360,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "rate(greptime_trigger_evaluate_failure_count[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Trigger Eval Failure Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Elapsed time to send trigger alerts to notification channels.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 204
+      },
+      "id": 361,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.75, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Send Alert Elapsed",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Rate of failures when sending trigger alerts.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 204
+      },
+      "id": 364,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "rate(greptime_trigger_send_alert_failure_count[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Send Alert Failure Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Elapsed time to persist trigger alert records.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 212
+      },
+      "id": 363,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.75, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Save Alert Elapsed",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${metrics}"
+      },
+      "description": "Rate of failures when persisting trigger alert records.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 212
+      },
+      "id": 362,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${metrics}"
+          },
+          "editorMode": "code",
+          "expr": "rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Save Alert Failure Rate",
+      "type": "timeseries"
    }
  ],
  "preload": false,
@@ -9613,4 +10362,4 @@
  "title": "GreptimeDB",
  "uid": "dejf3k5e7g2kgb",
  "version": 15
-}
+}
--- a/grafana/dashboards/metrics/standalone/dashboard.md
+++ b/grafana/dashboards/metrics/standalone/dashboard.md
@@ -111,12 +111,34 @@
 | Rate of meta KV Ops | `rate(greptime_meta_kv_request_elapsed_count[$__rate_interval])` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `none` | `{{pod}}-{{op}} p99` |
 | DDL Latency | `histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_tables_bucket))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_table))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_view))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_flow))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_drop_table))`<br/>`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_alter_table))` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `s` | `CreateLogicalTables-{{step}} p90` |
 | Reconciliation stats | `greptime_meta_reconciliation_stats` | `timeseries` | Reconciliation stats | `prometheus` | `s` | `{{pod}}-{{table_type}}-{{type}}` |
-| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps  | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` |
+| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` |
 # Flownode
 | Title | Query | Type | Description | Datasource | Unit | Legend Format |
 | --- | --- | --- | --- | --- | --- | --- |
 | Flow Ingest / Output Rate | `sum by(instance, pod, direction) (rate(greptime_flow_processed_rows[$__rate_interval]))` | `timeseries` | Flow Ingest / Output Rate. | `prometheus` | -- | `[{{pod}}]-[{{instance}}]-[{{direction}}]` |
 | Flow Ingest Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))`<br/>`histogram_quantile(0.99, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))` | `timeseries` | Flow Ingest Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-p95` |
 | Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`<br/>`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` |
-| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}]` |
+| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` |
 | Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` |
+# Trigger
+| Title | Query | Type | Description | Datasource | Unit | Legend Format |
+| --- | --- | --- | --- | --- | --- | --- |
+| Trigger Count | `greptime_trigger_count{}` | `timeseries` | Total number of triggers currently defined. | `prometheus` | -- | `__auto` |
+| Trigger Eval Elapsed | `histogram_quantile(0.99, 
+  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])
+)`<br/>`histogram_quantile(0.75, 
+  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])
+)` | `timeseries` | Elapsed time for trigger evaluation, including query execution and condition evaluation. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-p99` |
+| Trigger Eval Failure Rate | `rate(greptime_trigger_evaluate_failure_count[$__rate_interval])` | `timeseries` | Rate of failed trigger evaluations. | `prometheus` | `none` | `__auto` |
+| Send Alert Elapsed | `histogram_quantile(0.99, 
+  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])
+)`<br/>`histogram_quantile(0.75, 
+  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])
+)` | `timeseries` | Elapsed time to send trigger alerts to notification channels. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99` |
+| Send Alert Failure Rate | `rate(greptime_trigger_send_alert_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when sending trigger alerts. | `prometheus` | `none` | `__auto` |
+| Save Alert Elapsed | `histogram_quantile(0.99, 
+  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])
+)`<br/>`histogram_quantile(0.75, 
+  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])
+)` | `timeseries` | Elapsed time to persist trigger alert records. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99` |
+| Save Alert Failure Rate | `rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when persisting trigger alert records. | `prometheus` | `none` | `__auto` |
--- a/grafana/dashboards/metrics/standalone/dashboard.yaml
+++ b/grafana/dashboards/metrics/standalone/dashboard.yaml
@@ -1002,7 +1002,7 @@ groups:
              legendFormat: '{{pod}}-{{table_type}}-{{type}}'
        - title: Reconciliation steps
          type: timeseries
-          description: 'Elapsed of Reconciliation steps '
+          description: Elapsed of Reconciliation steps
          unit: s
          queries:
            - expr: histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)
@@ -1057,7 +1057,7 @@ groups:
              datasource:
                type: prometheus
                uid: ${metrics}
-              legendFormat: '[{{instance}}]-[{{pod}]'
+              legendFormat: '[{{instance}}]-[{{pod}}]'
        - title: Flow Processing Error per Instance
          type: timeseries
          description: Flow Processing Error per Instance.
@@ -1067,3 +1067,89 @@ groups:
                type: prometheus
                uid: ${metrics}
              legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]'
+    - title: Trigger
+      panels:
+        - title: Trigger Count
+          type: timeseries
+          description: Total number of triggers currently defined.
+          queries:
+            - expr: greptime_trigger_count{}
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
+        - title: Trigger Eval Elapsed
+          type: timeseries
+          description: Elapsed time for trigger evaluation, including query execution and condition evaluation.
+          unit: s
+          queries:
+            - expr: "histogram_quantile(0.99, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-p99'
+            - expr: "histogram_quantile(0.75, \n  rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-p75'
+        - title: Trigger Eval Failure Rate
+          type: timeseries
+          description: Rate of failed trigger evaluations.
+          unit: none
+          queries:
+            - expr: rate(greptime_trigger_evaluate_failure_count[$__rate_interval])
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
+        - title: Send Alert Elapsed
+          type: timeseries
+          description: Elapsed time to send trigger alerts to notification channels.
+          unit: s
+          queries:
+            - expr: "histogram_quantile(0.99, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99'
+            - expr: "histogram_quantile(0.75, \n  rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75'
+        - title: Send Alert Failure Rate
+          type: timeseries
+          description: Rate of failures when sending trigger alerts.
+          unit: none
+          queries:
+            - expr: rate(greptime_trigger_send_alert_failure_count[$__rate_interval])
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
+        - title: Save Alert Elapsed
+          type: timeseries
+          description: Elapsed time to persist trigger alert records.
+          unit: s
+          queries:
+            - expr: "histogram_quantile(0.99, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99'
+            - expr: "histogram_quantile(0.75, \n  rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)"
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75'
+        - title: Save Alert Failure Rate
+          type: timeseries
+          description: Rate of failures when persisting trigger alert records.
+          unit: none
+          queries:
+            - expr: rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])
+              datasource:
+                type: prometheus
+                uid: ${metrics}
+              legendFormat: __auto
--- a/src/api/src/helper.rs
+++ b/src/api/src/helper.rs
@@ -895,7 +895,7 @@ pub fn is_column_type_value_eq(
        .unwrap_or(false)
 }

-fn encode_json_value(value: JsonValue) -> v1::JsonValue {
+pub fn encode_json_value(value: JsonValue) -> v1::JsonValue {
    fn helper(json: JsonVariant) -> v1::JsonValue {
        let value = match json {
            JsonVariant::Null => None,
--- a/src/api/src/v1/column_def.rs
+++ b/src/api/src/v1/column_def.rs
@@ -17,8 +17,8 @@ use std::collections::HashMap;
 use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};
 use datatypes::schema::{
    COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer,
-    FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions,
-    SkippingIndexType,
+    FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata, SKIPPING_INDEX_KEY,
+    SkippingIndexOptions, SkippingIndexType,
 };
 use greptime_proto::v1::{
    Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType,
@@ -36,6 +36,14 @@ const INVERTED_INDEX_GRPC_KEY: &str = "inverted_index";
 /// Key used to store skip index options in gRPC column options.
 const SKIPPING_INDEX_GRPC_KEY: &str = "skipping_index";

+const COLUMN_OPTION_MAPPINGS: [(&str, &str); 5] = [
+    (FULLTEXT_GRPC_KEY, FULLTEXT_KEY), // (key in gRPC column options, key in schema metadata)
+    (INVERTED_INDEX_GRPC_KEY, INVERTED_INDEX_KEY),
+    (SKIPPING_INDEX_GRPC_KEY, SKIPPING_INDEX_KEY),
+    (EXTENSION_TYPE_NAME_KEY, EXTENSION_TYPE_NAME_KEY), // same key on both sides
+    (EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_METADATA_KEY),
+];
+
 /// Tries to construct a `ColumnSchema` from the given  `ColumnDef`.
 pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
    let data_type = ColumnDataTypeWrapper::try_new(
@@ -131,6 +139,21 @@ pub fn try_as_column_def(column_schema: &ColumnSchema, is_primary_key: bool) ->
    })
 }

+/// Copies the mapped [ColumnOptions] entries into a [Metadata] (e.g. for a [ColumnSchema]).
+pub fn collect_column_options(column_options: Option<&ColumnOptions>) -> Metadata {
+    let Some(ColumnOptions { options }) = column_options else {
+        return Metadata::default(); // no column options: empty metadata
+    };
+
+    let mut metadata = Metadata::with_capacity(options.len());
+    for (x, y) in COLUMN_OPTION_MAPPINGS {
+        if let Some(v) = options.get(x) {
+            metadata.insert(y.to_string(), v.clone()); // keys not in the mapping are skipped
+        }
+    }
+    metadata
+}
+
 /// Constructs a `ColumnOptions` from the given `ColumnSchema`.
 pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<ColumnOptions> {
    let mut options = ColumnOptions::default();
--- a/src/catalog/src/lib.rs
+++ b/src/catalog/src/lib.rs
@@ -32,6 +32,7 @@ use crate::error::Result;
 pub mod error;
 pub mod information_extension;
 pub mod kvbackend;
+#[cfg(any(test, feature = "testing"))]
 pub mod memory;
 mod metrics;
 pub mod system_schema;
--- a/src/catalog/src/metrics.rs
+++ b/src/catalog/src/metrics.rs
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-pub(crate) const METRIC_DB_LABEL: &str = "db";
-
 use lazy_static::lazy_static;
 use prometheus::*;

@@ -25,7 +23,7 @@ lazy_static! {
    pub static ref METRIC_CATALOG_MANAGER_TABLE_COUNT: IntGaugeVec = register_int_gauge_vec!(
        "greptime_catalog_table_count",
        "catalog table count",
-        &[METRIC_DB_LABEL]
+        &["db"]
    )
    .unwrap();
    pub static ref METRIC_CATALOG_KV_REMOTE_GET: Histogram =
--- a/src/catalog/src/system_schema.rs
+++ b/src/catalog/src/system_schema.rs
@@ -24,6 +24,7 @@ use std::sync::Arc;

 use common_error::ext::BoxedError;
 use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
+use common_telemetry::tracing::Span;
 use datatypes::schema::SchemaRef;
 use futures_util::StreamExt;
 use snafu::ResultExt;
@@ -163,6 +164,7 @@ impl DataSource for SystemTableDataSource {
            stream: Box::pin(stream),
            output_ordering: None,
            metrics: Default::default(),
+            span: Span::current(),
        };

        Ok(Box::pin(stream))
--- a/src/catalog/src/system_schema/information_schema/columns.rs
+++ b/src/catalog/src/system_schema/information_schema/columns.rs
@@ -399,8 +399,8 @@ impl InformationSchemaColumnsBuilder {
            self.is_nullables.push(Some("No"));
        }
        self.column_types.push(Some(&data_type));
-        self.column_comments
-            .push(column_schema.column_comment().map(|x| x.as_ref()));
+        let column_comment = column_schema.column_comment().map(|x| x.as_ref());
+        self.column_comments.push(column_comment);
    }

    fn finish(&mut self) -> Result<RecordBatch> {
--- a/src/catalog/src/system_schema/information_schema/tables.rs
+++ b/src/catalog/src/system_schema/information_schema/tables.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use core::pin::pin;
 use std::sync::{Arc, Weak};

 use arrow_schema::SchemaRef as ArrowSchemaRef;
@@ -31,15 +32,17 @@ use datatypes::value::Value;
 use datatypes::vectors::{
    StringVectorBuilder, TimestampSecondVectorBuilder, UInt32VectorBuilder, UInt64VectorBuilder,
 };
-use futures::TryStreamExt;
+use futures::StreamExt;
 use snafu::{OptionExt, ResultExt};
-use store_api::storage::{RegionId, ScanRequest, TableId};
+use store_api::storage::{ScanRequest, TableId};
 use table::metadata::{TableInfo, TableType};

 use crate::CatalogManager;
 use crate::error::{
-    CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
+    CreateRecordBatchSnafu, FindRegionRoutesSnafu, InternalSnafu, Result,
+    UpgradeWeakCatalogManagerRefSnafu,
 };
+use crate::kvbackend::KvBackendCatalogManager;
 use crate::system_schema::information_schema::{InformationTable, Predicates, TABLES};
 use crate::system_schema::utils;

@@ -247,6 +250,10 @@ impl InformationSchemaTablesBuilder {
            .catalog_manager
            .upgrade()
            .context(UpgradeWeakCatalogManagerRefSnafu)?;
+        let partition_manager = catalog_manager
+            .as_any()
+            .downcast_ref::<KvBackendCatalogManager>()
+            .map(|catalog_manager| catalog_manager.partition_manager());
        let predicates = Predicates::from_scan_request(&request);

        let information_extension = utils::information_extension(&self.catalog_manager)?;
@@ -267,37 +274,59 @@ impl InformationSchemaTablesBuilder {
        };

        for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
-            let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
+            let table_stream = catalog_manager.tables(&catalog_name, &schema_name, None);

-            while let Some(table) = stream.try_next().await? {
-                let table_info = table.table_info();
+            const BATCH_SIZE: usize = 128;
+            // Split tables into chunks
+            let mut table_chunks = pin!(table_stream.ready_chunks(BATCH_SIZE));

-                // TODO(dennis): make it working for metric engine
-                let table_region_stats =
-                    if table_info.meta.engine == MITO_ENGINE || table_info.is_physical_table() {
-                        table_info
-                            .meta
-                            .region_numbers
-                            .iter()
-                            .map(|n| RegionId::new(table_info.ident.table_id, *n))
-                            .flat_map(|region_id| {
-                                region_stats
-                                    .binary_search_by_key(&region_id, |x| x.id)
-                                    .map(|i| &region_stats[i])
-                            })
-                            .collect::<Vec<_>>()
-                    } else {
-                        vec![]
-                    };
+            while let Some(tables) = table_chunks.next().await {
+                let tables = tables.into_iter().collect::<Result<Vec<_>>>()?;
+                let mito_or_physical_table_ids = tables
+                    .iter()
+                    .filter(|table| {
+                        table.table_info().meta.engine == MITO_ENGINE
+                            || table.table_info().is_physical_table()
+                    })
+                    .map(|table| table.table_info().ident.table_id)
+                    .collect::<Vec<_>>();

-                self.add_table(
-                    &predicates,
-                    &catalog_name,
-                    &schema_name,
-                    table_info,
-                    table.table_type(),
-                    &table_region_stats,
-                );
+                let table_routes = if let Some(partition_manager) = &partition_manager {
+                    partition_manager
+                        .batch_find_region_routes(&mito_or_physical_table_ids)
+                        .await
+                        .context(FindRegionRoutesSnafu)?
+                } else {
+                    mito_or_physical_table_ids
+                        .into_iter()
+                        .map(|id| (id, vec![]))
+                        .collect()
+                };
+
+                for table in tables {
+                    let table_region_stats =
+                        match table_routes.get(&table.table_info().ident.table_id) {
+                            Some(routes) => routes
+                                .iter()
+                                .flat_map(|route| {
+                                    let region_id = route.region.id;
+                                    region_stats
+                                        .binary_search_by_key(&region_id, |x| x.id)
+                                        .map(|i| &region_stats[i])
+                                })
+                                .collect::<Vec<_>>(),
+                            None => vec![],
+                        };
+
+                    self.add_table(
+                        &predicates,
+                        &catalog_name,
+                        &schema_name,
+                        table.table_info(),
+                        table.table_type(),
+                        &table_region_stats,
+                    );
+                }
            }
        }

--- a/src/catalog/src/table_source.rs
+++ b/src/catalog/src/table_source.rs
@@ -337,7 +337,7 @@ mod tests {
        .build();

        let table_metadata_manager = TableMetadataManager::new(backend);
-        let mut view_info = common_meta::key::test_utils::new_test_table_info(1024, vec![]);
+        let mut view_info = common_meta::key::test_utils::new_test_table_info(1024);
        view_info.table_type = TableType::View;
        let logical_plan = vec![1, 2, 3];
        // Create view metadata
--- a/src/cli/Cargo.toml
+++ b/src/cli/Cargo.toml
@@ -60,6 +60,7 @@ serde_json.workspace = true
 servers.workspace = true
 session.workspace = true
 snafu.workspace = true
+standalone.workspace = true
 store-api.workspace = true
 table.workspace = true
 tokio.workspace = true
--- a/src/cli/src/bench.rs
+++ b/src/cli/src/bench.rs
@@ -162,7 +162,6 @@ fn create_table_info(table_id: TableId, table_name: TableName) -> RawTableInfo {
        next_column_id: columns as u32 + 1,
        value_indices: vec![],
        options: Default::default(),
-        region_numbers: (1..=100).collect(),
        partition_key_indices: vec![],
        column_ids: vec![],
    };
--- a/src/cli/src/common/object_store.rs
+++ b/src/cli/src/common/object_store.rs
@@ -267,8 +267,6 @@ impl PrefixedS3Connection {
            name: "S3",
            required: [
                (&self.s3_bucket, "bucket"),
-                (&self.s3_access_key_id, "access key ID"),
-                (&self.s3_secret_access_key, "secret access key"),
                (&self.s3_region, "region"),
            ]
        )
--- a/src/cli/src/common/store.rs
+++ b/src/cli/src/common/store.rs
@@ -14,16 +14,38 @@

 use std::sync::Arc;

-use clap::Parser;
+use clap::{Parser, ValueEnum};
 use common_error::ext::BoxedError;
 use common_meta::kv_backend::KvBackendRef;
 use common_meta::kv_backend::chroot::ChrootKvBackend;
 use common_meta::kv_backend::etcd::EtcdStore;
-use meta_srv::metasrv::{BackendClientOptions, BackendImpl};
+use meta_srv::metasrv::BackendClientOptions;
 use meta_srv::utils::etcd::create_etcd_client_with_tls;
+use serde::{Deserialize, Serialize};
 use servers::tls::{TlsMode, TlsOption};
+use snafu::OptionExt;

-use crate::error::EmptyStoreAddrsSnafu;
+use crate::error::{EmptyStoreAddrsSnafu, InvalidArgumentsSnafu};
+
+// The datastores that implement the metadata kvbackend.
+#[derive(Clone, Debug, PartialEq, Serialize, Default, Deserialize, ValueEnum)]
+#[serde(rename_all = "snake_case")]
+#[allow(clippy::enum_variant_names)] // every variant shares the `Store` suffix
+pub enum BackendImpl {
+    // Etcd as metadata storage.
+    #[default]
+    EtcdStore,
+    // In memory metadata storage - mostly used for testing.
+    MemoryStore,
+    #[cfg(feature = "pg_kvbackend")]
+    // Postgres as metadata storage.
+    PostgresStore,
+    #[cfg(feature = "mysql_kvbackend")]
+    // MySQL as metadata storage.
+    MysqlStore,
+    // RaftEngine as metadata storage.
+    RaftEngineStore,
+}

 #[derive(Debug, Default, Parser)]
 pub struct StoreConfig {
@@ -61,6 +83,12 @@ pub struct StoreConfig {
    #[cfg(feature = "pg_kvbackend")]
    #[clap(long)]
    pub meta_schema_name: Option<String>,
+
+    /// Automatically create PostgreSQL schema if it doesn't exist (default: true).
+    #[cfg(feature = "pg_kvbackend")]
+    #[clap(long, default_value_t = true)]
+    pub auto_create_schema: bool,
+
    /// TLS mode for backend store connections (etcd, PostgreSQL, MySQL)
    #[clap(long = "backend-tls-mode", value_enum, default_value = "disable")]
    pub backend_tls_mode: TlsMode,
@@ -86,7 +114,7 @@ impl StoreConfig {
    pub fn tls_config(&self) -> Option<TlsOption> {
        if self.backend_tls_mode != TlsMode::Disable {
            Some(TlsOption {
-                mode: self.backend_tls_mode.clone(),
+                mode: self.backend_tls_mode,
                cert_path: self.backend_tls_cert_path.clone(),
                key_path: self.backend_tls_key_path.clone(),
                ca_cert_path: self.backend_tls_ca_cert_path.clone(),
@@ -138,6 +166,7 @@ impl StoreConfig {
                        schema_name,
                        table_name,
                        max_txn_ops,
+                        self.auto_create_schema,
                    )
                    .await
                    .map_err(BoxedError::new)?)
@@ -172,6 +201,18 @@ impl StoreConfig {

                    Ok(Arc::new(MemoryKvBackend::default()) as _)
                }
+                BackendImpl::RaftEngineStore => {
+                    let url = store_addrs
+                        .first()
+                        .context(InvalidArgumentsSnafu {
+                            msg: "empty store addresses".to_string(),
+                        })
+                        .map_err(BoxedError::new)?;
+                    let kvbackend =
+                        standalone::build_metadata_kv_from_url(url).map_err(BoxedError::new)?;
+
+                    Ok(kvbackend)
+                }
            };
            if self.store_key_prefix.is_empty() {
                kvbackend
--- a/src/cli/src/data/export.rs
+++ b/src/cli/src/data/export.rs
@@ -900,67 +900,6 @@ mod tests {

    // ==================== Gap 2: Empty string vs missing tests ====================

-    #[tokio::test]
-    async fn test_export_command_build_with_s3_empty_access_key() {
-        // Test S3 with empty access key ID (empty string, not missing)
-        let cmd = ExportCommand::parse_from([
-            "export",
-            "--addr",
-            "127.0.0.1:4000",
-            "--s3",
-            "--s3-bucket",
-            "test-bucket",
-            "--s3-root",
-            "test-root",
-            "--s3-access-key-id",
-            "", // Empty string
-            "--s3-secret-access-key",
-            "test-secret",
-            "--s3-region",
-            "us-west-2",
-        ]);
-
-        let result = cmd.build().await;
-        assert!(result.is_err());
-        if let Err(err) = result {
-            assert!(
-                err.to_string().contains("S3 access key ID must be set"),
-                "Actual error: {}",
-                err
-            );
-        }
-    }
-
-    #[tokio::test]
-    async fn test_export_command_build_with_s3_missing_secret_key() {
-        // Test S3 with empty secret access key
-        let cmd = ExportCommand::parse_from([
-            "export",
-            "--addr",
-            "127.0.0.1:4000",
-            "--s3",
-            "--s3-bucket",
-            "test-bucket",
-            "--s3-root",
-            "test-root",
-            "--s3-access-key-id",
-            "test-key",
-            // Missing --s3-secret-access-key
-            "--s3-region",
-            "us-west-2",
-        ]);
-
-        let result = cmd.build().await;
-        assert!(result.is_err());
-        if let Err(err) = result {
-            assert!(
-                err.to_string().contains("S3 secret access key must be set"),
-                "Actual error: {}",
-                err
-            );
-        }
-    }
-
    #[tokio::test]
    async fn test_export_command_build_with_s3_empty_root() {
        // Empty root should be allowed (it's optional path component)
--- a/src/cli/src/error.rs
+++ b/src/cli/src/error.rs
@@ -68,8 +68,8 @@ pub enum Error {
        source: common_procedure::error::Error,
    },

-    #[snafu(display("Failed to start wal options allocator"))]
-    StartWalOptionsAllocator {
+    #[snafu(display("Failed to start wal provider"))]
+    StartWalProvider {
        #[snafu(implicit)]
        location: Location,
        source: common_meta::error::Error,
@@ -343,7 +343,7 @@ impl ErrorExt for Error {

            Error::StartProcedureManager { source, .. }
            | Error::StopProcedureManager { source, .. } => source.status_code(),
-            Error::StartWalOptionsAllocator { source, .. } => source.status_code(),
+            Error::StartWalProvider { source, .. } => source.status_code(),
            Error::HttpQuerySql { .. } => StatusCode::Internal,
            Error::ParseSql { source, .. } | Error::PlanStatement { source, .. } => {
                source.status_code()
--- a/src/cli/src/metadata/snapshot.rs
+++ b/src/cli/src/metadata/snapshot.rs
@@ -288,9 +288,16 @@ fn build_object_store_and_resolve_file_path(
 #[cfg(test)]
 mod tests {
    use std::env;
+    use std::sync::Arc;
+    use std::time::Duration;

    use clap::Parser;
+    use common_meta::kv_backend::KvBackend;
+    use common_meta::kv_backend::memory::MemoryKvBackend;
+    use common_meta::rpc::store::PutRequest;
+    use object_store::ObjectStore;

+    use super::*;
    use crate::metadata::snapshot::RestoreCommand;

    #[tokio::test]
@@ -334,4 +341,97 @@ mod tests {
        let tool = cmd.build().await.unwrap();
        assert_eq!(tool.file_path, file_path.to_string_lossy().to_string());
    }
+
+    async fn setup_backup_file(object_store: ObjectStore, file_path: &str) {
+        let kv_backend = Arc::new(MemoryKvBackend::default());
+        let manager = MetadataSnapshotManager::new(kv_backend.clone(), object_store);
+        // Seed the kv backend with a single `test` -> `test` entry before dumping it.
+        kv_backend
+            .put(
+                PutRequest::new()
+                    .with_key(b"test".to_vec())
+                    .with_value(b"test".to_vec()),
+            )
+            .await
+            .unwrap();
+        manager.dump(file_path).await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_restore_raft_engine_store() {
+        common_telemetry::init_default_ut_logging();
+        let temp_dir = tempfile::tempdir().unwrap();
+        let root = temp_dir.path().display().to_string();
+        let object_store = new_fs_object_store(&root).unwrap();
+        setup_backup_file(object_store, "/backup/metadata_snapshot.metadata.fb").await;
+
+        {
+            let cmd = RestoreCommand::parse_from([
+                "",
+                "--file_name",
+                format!("{}/backup/metadata_snapshot.metadata.fb", root).as_str(),
+                "--backend",
+                "raft-engine-store",
+                "--store-addrs",
+                format!("raftengine:///{}/metadata", root).as_str(),
+            ]);
+            let tool = cmd.build().await.unwrap();
+            tool.do_work().await.unwrap();
+        }
+        // Waits for the raft engine to release the file lock.
+        tokio::time::sleep(Duration::from_secs(1)).await;
+        let kv =
+            standalone::build_metadata_kvbackend(format!("{}/metadata", root), Default::default())
+                .unwrap();
+
+        let value = kv.get(b"test").await.unwrap().unwrap().value;
+        assert_eq!(value, b"test");
+    }
+
+    #[tokio::test]
+    async fn test_save_raft_engine_store() {
+        common_telemetry::init_default_ut_logging();
+        let temp_dir = tempfile::tempdir().unwrap();
+        let root = temp_dir.path().display().to_string();
+        {
+            let kv = standalone::build_metadata_kvbackend(
+                format!("{}/metadata", root),
+                Default::default(),
+            )
+            .unwrap();
+            kv.put(
+                PutRequest::new()
+                    .with_key(b"test".to_vec())
+                    .with_value(b"test".to_vec()),
+            )
+            .await
+            .unwrap();
+        }
+        // Waits for the raft engine to release the file lock.
+        tokio::time::sleep(Duration::from_secs(1)).await;
+        {
+            let cmd = SaveCommand::parse_from([
+                "",
+                "--file_name",
+                format!("{}/backup/metadata_snapshot.metadata.fb", root).as_str(),
+                "--backend",
+                "raft-engine-store",
+                "--store-addrs",
+                format!("raftengine:///{}/metadata", root).as_str(),
+            ]);
+            let tool = cmd.build().await.unwrap();
+            tool.do_work().await.unwrap();
+        }
+
+        // Reads the snapshot file from the object store.
+        let object_store = new_fs_object_store(&root).unwrap();
+        let kv_backend = Arc::new(MemoryKvBackend::default());
+        let manager = MetadataSnapshotManager::new(kv_backend.clone(), object_store);
+        manager
+            .restore("/backup/metadata_snapshot.metadata.fb")
+            .await
+            .unwrap();
+        let value = kv_backend.get(b"test").await.unwrap().unwrap().value;
+        assert_eq!(value, b"test");
+    }
 }
--- a/src/client/src/database.rs
+++ b/src/client/src/database.rs
@@ -37,6 +37,7 @@ use common_grpc::flight::{FlightDecoder, FlightMessage};
 use common_query::Output;
 use common_recordbatch::error::ExternalSnafu;
 use common_recordbatch::{RecordBatch, RecordBatchStreamWrapper};
+use common_telemetry::tracing::Span;
 use common_telemetry::tracing_context::W3cTrace;
 use common_telemetry::{error, warn};
 use futures::future;
@@ -456,6 +457,7 @@ impl Database {
                    stream,
                    output_ordering: None,
                    metrics: Default::default(),
+                    span: Span::current(),
                };
                Ok(Output::new_with_stream(Box::pin(record_batch_stream)))
            }
--- a/src/client/src/region.rs
+++ b/src/client/src/region.rs
@@ -30,6 +30,7 @@ use common_query::request::QueryRequest;
 use common_recordbatch::error::ExternalSnafu;
 use common_recordbatch::{RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream};
 use common_telemetry::error;
+use common_telemetry::tracing::Span;
 use common_telemetry::tracing_context::TracingContext;
 use prost::Message;
 use query::query_engine::DefaultSerializer;
@@ -242,6 +243,7 @@ impl RegionRequester {
            stream,
            output_ordering: None,
            metrics,
+            span: Span::current(),
        };
        Ok(Box::pin(record_batch_stream))
    }
--- a/src/cmd/Cargo.toml
+++ b/src/cmd/Cargo.toml
@@ -18,6 +18,7 @@ default = [
 ]
 enterprise = ["common-meta/enterprise", "frontend/enterprise", "meta-srv/enterprise"]
 tokio-console = ["common-telemetry/tokio-console"]
+vector_index = ["mito2/vector_index"]

 [lints]
 workspace = true
--- a/src/cmd/src/datanode.rs
+++ b/src/cmd/src/datanode.rs
@@ -330,7 +330,6 @@ mod tests {
    use common_config::ENV_VAR_SEP;
    use common_test_util::temp_dir::create_named_temp_file;
    use object_store::config::{FileConfig, GcsConfig, ObjectStoreConfig, S3Config};
-    use servers::heartbeat_options::HeartbeatOptions;

    use super::*;
    use crate::options::GlobalOptions;
@@ -374,9 +373,6 @@ mod tests {
            hostname = "127.0.0.1"
            runtime_size = 8

-            [heartbeat]
-            interval = "300ms"
-
            [meta_client]
            metasrv_addrs = ["127.0.0.1:3002"]
            timeout = "3s"
@@ -434,13 +430,6 @@ mod tests {
        );
        assert!(!raft_engine_config.sync_write);

-        let HeartbeatOptions {
-            interval: heart_beat_interval,
-            ..
-        } = options.heartbeat;
-
-        assert_eq!(300, heart_beat_interval.as_millis());
-
        let MetaClientOptions {
            metasrv_addrs: metasrv_addr,
            timeout,
--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -35,6 +35,7 @@ use mito2::sst::parquet::reader::ParquetReaderBuilder;
 use mito2::sst::parquet::{PARQUET_METADATA_KEY, WriteOptions};
 use mito2::worker::write_cache_from_config;
 use object_store::ObjectStore;
+use parquet::file::metadata::{FooterTail, KeyValue};
 use regex::Regex;
 use snafu::OptionExt;
 use store_api::metadata::{RegionMetadata, RegionMetadataRef};
@@ -233,6 +234,8 @@ impl ObjbenchCommand {
            inverted_index_config: MitoConfig::default().inverted_index,
            fulltext_index_config,
            bloom_filter_index_config: MitoConfig::default().bloom_filter_index,
+            #[cfg(feature = "vector_index")]
+            vector_index_config: Default::default(),
        };

        // Write SST
@@ -461,7 +464,6 @@ fn extract_region_metadata(
    file_path: &str,
    meta: &parquet::file::metadata::ParquetMetaData,
 ) -> error::Result<RegionMetadataRef> {
-    use parquet::format::KeyValue;
    let kvs: Option<&Vec<KeyValue>> = meta.file_metadata().key_value_metadata();
    let Some(kvs) = kvs else {
        return Err(error::IllegalConfigSnafu {
@@ -606,7 +608,7 @@ async fn load_parquet_metadata(
    let buffer_len = buffer.len();
    let mut footer = [0; 8];
    footer.copy_from_slice(&buffer[buffer_len - FOOTER_SIZE..]);
-    let footer = ParquetMetaDataReader::decode_footer_tail(&footer)?;
+    let footer = FooterTail::try_new(&footer)?;
    let metadata_len = footer.metadata_length() as u64;
    if actual_size - (FOOTER_SIZE as u64) < metadata_len {
        return Err("invalid footer/metadata length".into());
--- a/src/cmd/src/error.rs
+++ b/src/cmd/src/error.rs
@@ -64,8 +64,8 @@ pub enum Error {
        source: common_procedure::error::Error,
    },

-    #[snafu(display("Failed to start wal options allocator"))]
-    StartWalOptionsAllocator {
+    #[snafu(display("Failed to start wal provider"))]
+    StartWalProvider {
        #[snafu(implicit)]
        location: Location,
        source: common_meta::error::Error,
@@ -289,8 +289,8 @@ pub enum Error {
        location: Location,
    },

-    #[snafu(display("Failed to build wal options allocator"))]
-    BuildWalOptionsAllocator {
+    #[snafu(display("Failed to build wal provider"))]
+    BuildWalProvider {
        #[snafu(implicit)]
        location: Location,
        source: common_meta::error::Error,
@@ -350,8 +350,9 @@ impl ErrorExt for Error {

            Error::StartProcedureManager { source, .. }
            | Error::StopProcedureManager { source, .. } => source.status_code(),
-            Error::BuildWalOptionsAllocator { source, .. }
-            | Error::StartWalOptionsAllocator { source, .. } => source.status_code(),
+            Error::BuildWalProvider { source, .. } | Error::StartWalProvider { source, .. } => {
+                source.status_code()
+            }
            Error::HttpQuerySql { .. } => StatusCode::Internal,
            Error::ParseSql { source, .. } | Error::PlanStatement { source, .. } => {
                source.status_code()
--- a/src/cmd/src/flownode.rs
+++ b/src/cmd/src/flownode.rs
@@ -358,7 +358,6 @@ impl StartCommand {
        let heartbeat_task = flow::heartbeat::HeartbeatTask::new(
            &opts,
            meta_client.clone(),
-            opts.heartbeat.clone(),
            Arc::new(executor),
            Arc::new(resource_stat),
        );
--- a/src/cmd/src/frontend.rs
+++ b/src/cmd/src/frontend.rs
@@ -20,6 +20,7 @@ use std::time::Duration;
 use async_trait::async_trait;
 use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry};
 use catalog::information_extension::DistributedInformationExtension;
+use catalog::information_schema::InformationExtensionRef;
 use catalog::kvbackend::{
    CachedKvBackendBuilder, CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder,
    MetaKvBackend,
@@ -236,7 +237,7 @@ impl StartCommand {
        };

        let tls_opts = TlsOption::new(
-            self.tls_mode.clone(),
+            self.tls_mode,
            self.tls_cert_path.clone(),
            self.tls_key_path.clone(),
            self.tls_watch,
@@ -412,6 +413,7 @@ impl StartCommand {
            meta_client.clone(),
            client.clone(),
        ));
+        plugins.insert::<InformationExtensionRef>(information_extension.clone());

        let process_manager = Arc::new(ProcessManager::new(
            addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr)),
--- a/src/cmd/src/lib.rs
+++ b/src/cmd/src/lib.rs
@@ -108,7 +108,7 @@ pub trait App: Send {
    }
 }

-/// Log the versions of the application, and the arguments passed to the cli.
+/// Log the versions of the application.
 ///
 /// `version` should be the same as the output of cli "--version";
 /// and the `short_version` is the short version of the codes, often consist of git branch and commit.
@@ -118,10 +118,7 @@ pub fn log_versions(version: &str, short_version: &str, app: &str) {
        .with_label_values(&[common_version::version(), short_version, app])
        .inc();

-    // Log version and argument flags.
    info!("GreptimeDB version: {}", version);
-
-    log_env_flags();
 }

 pub fn create_resource_limit_metrics(app: &str) {
@@ -144,13 +141,6 @@ pub fn create_resource_limit_metrics(app: &str) {
    }
 }

-fn log_env_flags() {
-    info!("command line arguments");
-    for argument in std::env::args() {
-        info!("argument: {}", argument);
-    }
-}
-
 pub fn maybe_activate_heap_profile(memory_options: &common_options::memory::MemoryOptions) {
    if memory_options.enable_heap_profiling {
        match activate_heap_profile() {
--- a/src/cmd/src/metasrv.rs
+++ b/src/cmd/src/metasrv.rs
@@ -155,8 +155,6 @@ pub struct StartCommand {
    #[clap(short, long)]
    selector: Option<String>,
    #[clap(long)]
-    use_memory_store: Option<bool>,
-    #[clap(long)]
    enable_region_failover: Option<bool>,
    #[clap(long)]
    http_addr: Option<String>,
@@ -186,7 +184,6 @@ impl Debug for StartCommand {
            .field("store_addrs", &self.sanitize_store_addrs())
            .field("config_file", &self.config_file)
            .field("selector", &self.selector)
-            .field("use_memory_store", &self.use_memory_store)
            .field("enable_region_failover", &self.enable_region_failover)
            .field("http_addr", &self.http_addr)
            .field("http_timeout", &self.http_timeout)
@@ -268,10 +265,6 @@ impl StartCommand {
                .context(error::UnsupportedSelectorTypeSnafu { selector_type })?;
        }

-        if let Some(use_memory_store) = self.use_memory_store {
-            opts.use_memory_store = use_memory_store;
-        }
-
        if let Some(enable_region_failover) = self.enable_region_failover {
            opts.enable_region_failover = enable_region_failover;
        }
@@ -391,7 +384,6 @@ mod tests {
            server_addr = "127.0.0.1:3002"
            store_addr = "127.0.0.1:2379"
            selector = "LeaseBased"
-            use_memory_store = false

            [logging]
            level = "debug"
@@ -470,7 +462,6 @@ mod tests {
            server_addr = "127.0.0.1:3002"
            datanode_lease_secs = 15
            selector = "LeaseBased"
-            use_memory_store = false

            [http]
            addr = "127.0.0.1:4000"
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -40,7 +40,7 @@ use common_meta::procedure_executor::LocalProcedureExecutor;
 use common_meta::region_keeper::MemoryRegionKeeper;
 use common_meta::region_registry::LeaderRegionRegistry;
 use common_meta::sequence::SequenceBuilder;
-use common_meta::wal_options_allocator::{WalOptionsAllocatorRef, build_wal_options_allocator};
+use common_meta::wal_provider::{WalProviderRef, build_wal_provider};
 use common_procedure::ProcedureManagerRef;
 use common_query::prelude::set_default_prefix;
 use common_telemetry::info;
@@ -64,8 +64,8 @@ use plugins::frontend::context::{
 use plugins::standalone::context::DdlManagerConfigureContext;
 use servers::tls::{TlsMode, TlsOption, merge_tls_option};
 use snafu::ResultExt;
-use standalone::StandaloneInformationExtension;
 use standalone::options::StandaloneOptions;
+use standalone::{StandaloneInformationExtension, StandaloneRepartitionProcedureFactory};
 use tracing_appender::non_blocking::WorkerGuard;

 use crate::error::{OtherSnafu, Result, StartFlownodeSnafu};
@@ -120,7 +120,7 @@ pub struct Instance {
    frontend: Frontend,
    flownode: FlownodeInstance,
    procedure_manager: ProcedureManagerRef,
-    wal_options_allocator: WalOptionsAllocatorRef,
+    wal_provider: WalProviderRef,
    // Keep the logging guard to prevent the worker from being dropped.
    _guard: Vec<WorkerGuard>,
 }
@@ -146,10 +146,10 @@ impl App for Instance {
            .await
            .context(error::StartProcedureManagerSnafu)?;

-        self.wal_options_allocator
+        self.wal_provider
            .start()
            .await
-            .context(error::StartWalOptionsAllocatorSnafu)?;
+            .context(error::StartWalProviderSnafu)?;

        plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
            .await
@@ -261,7 +261,7 @@ impl StartCommand {
        };

        let tls_opts = TlsOption::new(
-            self.tls_mode.clone(),
+            self.tls_mode,
            self.tls_cert_path.clone(),
            self.tls_key_path.clone(),
            self.tls_watch,
@@ -468,7 +468,7 @@ impl StartCommand {
            flow_server: flownode.flow_engine(),
        });

-        let table_id_sequence = Arc::new(
+        let table_id_allocator = Arc::new(
            SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone())
                .initial(MIN_USER_TABLE_ID as u64)
                .step(10)
@@ -485,13 +485,13 @@ impl StartCommand {
            .clone()
            .try_into()
            .context(error::InvalidWalProviderSnafu)?;
-        let wal_options_allocator = build_wal_options_allocator(&kafka_options, kv_backend.clone())
+        let wal_provider = build_wal_provider(&kafka_options, kv_backend.clone())
            .await
-            .context(error::BuildWalOptionsAllocatorSnafu)?;
-        let wal_options_allocator = Arc::new(wal_options_allocator);
+            .context(error::BuildWalProviderSnafu)?;
+        let wal_provider = Arc::new(wal_provider);
        let table_metadata_allocator = Arc::new(TableMetadataAllocator::new(
-            table_id_sequence,
-            wal_options_allocator.clone(),
+            table_id_allocator,
+            wal_provider.clone(),
        ));
        let flow_metadata_allocator = Arc::new(FlowMetadataAllocator::with_noop_peer_allocator(
            flow_id_sequence,
@@ -509,8 +509,13 @@ impl StartCommand {
            region_failure_detector_controller: Arc::new(NoopRegionFailureDetectorControl),
        };

-        let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager.clone(), true)
-            .context(error::InitDdlManagerSnafu)?;
+        let ddl_manager = DdlManager::try_new(
+            ddl_context,
+            procedure_manager.clone(),
+            Arc::new(StandaloneRepartitionProcedureFactory),
+            true,
+        )
+        .context(error::InitDdlManagerSnafu)?;

        let ddl_manager = if let Some(configurator) =
            plugins.get::<DdlManagerConfiguratorRef<DdlManagerConfigureContext>>()
@@ -585,7 +590,7 @@ impl StartCommand {
            frontend,
            flownode,
            procedure_manager,
-            wal_options_allocator,
+            wal_provider,
            _guard: guard,
        })
    }
--- a/src/cmd/tests/load_config_test.rs
+++ b/src/cmd/tests/load_config_test.rs
@@ -228,7 +228,6 @@ fn test_load_flownode_example_config() {
                ..Default::default()
            },
            tracing: Default::default(),
-            heartbeat: Default::default(),
            // flownode deliberately use a slower query parallelism
            // to avoid overwhelming the frontend with too many queries
            query: QueryOptions {
--- a/src/common/datasource/Cargo.toml
+++ b/src/common/datasource/Cargo.toml
@@ -27,13 +27,14 @@ common-recordbatch.workspace = true
 common-runtime.workspace = true
 common-telemetry.workspace = true
 datafusion.workspace = true
+datafusion-datasource.workspace = true
 datafusion-orc.workspace = true
 datatypes.workspace = true
 futures.workspace = true
 lazy_static.workspace = true
 object-store.workspace = true
 object_store_opendal.workspace = true
-orc-rust = { version = "0.6.3", default-features = false, features = ["async"] }
+orc-rust = { version = "0.7", default-features = false, features = ["async"] }
 parquet.workspace = true
 paste.workspace = true
 regex.workspace = true
@@ -42,7 +43,7 @@ snafu.workspace = true
 strum.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
-url = "2.3"
+url.workspace = true

 [dev-dependencies]
 common-test-util.workspace = true
--- a/src/common/datasource/src/buffered_writer.rs
+++ b/src/common/datasource/src/buffered_writer.rs
@@ -14,7 +14,7 @@

 use arrow::record_batch::RecordBatch;
 use async_trait::async_trait;
-use datafusion::parquet::format::FileMetaData;
+use parquet::file::metadata::ParquetMetaData;

 use crate::error::Result;

@@ -24,5 +24,5 @@ pub trait DfRecordBatchEncoder {

 #[async_trait]
 pub trait ArrowWriterCloser {
-    async fn close(mut self) -> Result<FileMetaData>;
+    async fn close(mut self) -> Result<ParquetMetaData>;
 }
--- a/src/common/datasource/src/file_format.rs
+++ b/src/common/datasource/src/file_format.rs
@@ -40,7 +40,6 @@ use datafusion::datasource::physical_plan::{
 use datafusion::error::{DataFusionError, Result as DataFusionResult};
 use datafusion::physical_plan::SendableRecordBatchStream;
 use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
-use datatypes::arrow::datatypes::SchemaRef;
 use futures::{StreamExt, TryStreamExt};
 use object_store::ObjectStore;
 use object_store_opendal::OpendalStore;
@@ -303,24 +302,20 @@ where
 pub async fn file_to_stream(
    store: &ObjectStore,
    filename: &str,
-    file_schema: SchemaRef,
    file_source: Arc<dyn FileSource>,
    projection: Option<Vec<usize>>,
    compression_type: CompressionType,
 ) -> Result<DfSendableRecordBatchStream> {
    let df_compression: DfCompressionType = compression_type.into();
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        file_schema,
-        file_source.clone(),
-    )
-    .with_file_group(FileGroup::new(vec![PartitionedFile::new(
-        filename.to_string(),
-        0,
-    )]))
-    .with_projection(projection)
-    .with_file_compression_type(df_compression)
-    .build();
+    let config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source.clone())
+            .with_file_group(FileGroup::new(vec![PartitionedFile::new(
+                filename.to_string(),
+                0,
+            )]))
+            .with_projection_indices(projection)
+            .with_file_compression_type(df_compression)
+            .build();

    let store = Arc::new(OpendalStore::new(store.clone()));
    let file_opener = file_source
--- a/src/common/datasource/src/file_format/csv.rs
+++ b/src/common/datasource/src/file_format/csv.rs
@@ -440,14 +440,11 @@ mod tests {
                .await
                .unwrap(),
            );
-            let csv_source = CsvSource::new(true, b',', b'"')
-                .with_schema(schema.clone())
-                .with_batch_size(8192);
+            let csv_source = CsvSource::new(schema).with_batch_size(8192);

            let stream = file_to_stream(
                &store,
                compressed_file_path_str,
-                schema.clone(),
                csv_source.clone(),
                None,
                compression_type,
--- a/src/common/datasource/src/file_format/json.rs
+++ b/src/common/datasource/src/file_format/json.rs
@@ -347,14 +347,11 @@ mod tests {
                .await
                .unwrap(),
            );
-            let json_source = JsonSource::new()
-                .with_schema(schema.clone())
-                .with_batch_size(8192);
+            let json_source = JsonSource::new(schema).with_batch_size(8192);

            let stream = file_to_stream(
                &store,
                compressed_file_path_str,
-                schema.clone(),
                json_source.clone(),
                None,
                compression_type,
--- a/src/common/datasource/src/file_format/parquet.rs
+++ b/src/common/datasource/src/file_format/parquet.rs
@@ -18,15 +18,15 @@ use std::sync::Arc;
 use arrow::record_batch::RecordBatch;
 use arrow_schema::Schema;
 use async_trait::async_trait;
-use datafusion::datasource::physical_plan::{FileMeta, ParquetFileReaderFactory};
+use datafusion::datasource::physical_plan::ParquetFileReaderFactory;
 use datafusion::error::Result as DatafusionResult;
 use datafusion::parquet::arrow::async_reader::AsyncFileReader;
 use datafusion::parquet::arrow::{ArrowWriter, parquet_to_arrow_schema};
 use datafusion::parquet::errors::{ParquetError, Result as ParquetResult};
 use datafusion::parquet::file::metadata::ParquetMetaData;
-use datafusion::parquet::format::FileMetaData;
 use datafusion::physical_plan::SendableRecordBatchStream;
 use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion_datasource::PartitionedFile;
 use datatypes::schema::SchemaRef;
 use futures::StreamExt;
 use futures::future::BoxFuture;
@@ -100,11 +100,11 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory {
    fn create_reader(
        &self,
        _partition_index: usize,
-        file_meta: FileMeta,
+        partitioned_file: PartitionedFile,
        _metadata_size_hint: Option<usize>,
        _metrics: &ExecutionPlanMetricsSet,
    ) -> DatafusionResult<Box<dyn AsyncFileReader + Send>> {
-        let path = file_meta.location().to_string();
+        let path = partitioned_file.path().to_string();
        let object_store = self.object_store.clone();

        Ok(Box::new(LazyParquetFileReader::new(object_store, path)))
@@ -180,7 +180,7 @@ impl DfRecordBatchEncoder for ArrowWriter<SharedBuffer> {

 #[async_trait]
 impl ArrowWriterCloser for ArrowWriter<SharedBuffer> {
-    async fn close(self) -> Result<FileMetaData> {
+    async fn close(self) -> Result<ParquetMetaData> {
        self.close().context(error::EncodeRecordBatchSnafu)
    }
 }
--- a/src/common/datasource/src/file_format/tests.rs
+++ b/src/common/datasource/src/file_format/tests.rs
@@ -67,14 +67,14 @@ impl Test<'_> {
 async fn test_json_opener() {
    let store = test_store("/");
    let schema = test_basic_schema();
-    let file_source = Arc::new(JsonSource::new()).with_batch_size(test_util::TEST_BATCH_SIZE);
+    let file_source = Arc::new(JsonSource::new(schema)).with_batch_size(test_util::TEST_BATCH_SIZE);

    let path = &find_workspace_path("/src/common/datasource/tests/json/basic.json")
        .display()
        .to_string();
    let tests = [
        Test {
-            config: scan_config(schema.clone(), None, path, file_source.clone()),
+            config: scan_config(None, path, file_source.clone()),
            file_source: file_source.clone(),
            expected: vec![
                "+-----+-------+",
@@ -87,7 +87,7 @@ async fn test_json_opener() {
            ],
        },
        Test {
-            config: scan_config(schema, Some(1), path, file_source.clone()),
+            config: scan_config(Some(1), path, file_source.clone()),
            file_source,
            expected: vec![
                "+-----+------+",
@@ -112,13 +112,11 @@ async fn test_csv_opener() {
        .display()
        .to_string();

-    let file_source = CsvSource::new(true, b',', b'"')
-        .with_batch_size(test_util::TEST_BATCH_SIZE)
-        .with_schema(schema.clone());
+    let file_source = CsvSource::new(schema).with_batch_size(test_util::TEST_BATCH_SIZE);

    let tests = [
        Test {
-            config: scan_config(schema.clone(), None, path, file_source.clone()),
+            config: scan_config(None, path, file_source.clone()),
            file_source: file_source.clone(),
            expected: vec![
                "+-----+-------+---------------------+----------+------------+",
@@ -131,7 +129,7 @@ async fn test_csv_opener() {
            ],
        },
        Test {
-            config: scan_config(schema, Some(1), path, file_source.clone()),
+            config: scan_config(Some(1), path, file_source.clone()),
            file_source,
            expected: vec![
                "+-----+------+---------------------+----------+------------+",
@@ -158,10 +156,10 @@ async fn test_parquet_exec() {
        .display()
        .to_string();

-    let parquet_source = ParquetSource::default()
+    let parquet_source = ParquetSource::new(schema)
        .with_parquet_file_reader_factory(Arc::new(DefaultParquetFileReaderFactory::new(store)));

-    let config = scan_config(schema, None, path, Arc::new(parquet_source));
+    let config = scan_config(None, path, Arc::new(parquet_source));
    let exec = DataSourceExec::from_data_source(config);
    let ctx = SessionContext::new();

@@ -197,11 +195,11 @@ async fn test_orc_opener() {

    let store = test_store("/");
    let schema = Arc::new(OrcFormat.infer_schema(&store, path).await.unwrap());
-    let file_source = Arc::new(OrcSource::default());
+    let file_source = Arc::new(OrcSource::new(schema.into()));

    let tests = [
        Test {
-            config: scan_config(schema.clone(), None, path, file_source.clone()),
+            config: scan_config(None, path, file_source.clone()),
            file_source: file_source.clone(),
            expected: vec![
                "+----------+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+",
@@ -216,7 +214,7 @@ async fn test_orc_opener() {
            ],
        },
        Test {
-            config: scan_config(schema.clone(), Some(1), path, file_source.clone()),
+            config: scan_config(Some(1), path, file_source.clone()),
            file_source,
            expected: vec![
                "+----------+-----+------+------------+---+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+-------------------------+-------------+",
--- a/src/common/datasource/src/test_util.rs
+++ b/src/common/datasource/src/test_util.rs
@@ -80,7 +80,6 @@ pub fn csv_basic_schema() -> SchemaRef {
 }

 pub(crate) fn scan_config(
-    file_schema: SchemaRef,
    limit: Option<usize>,
    filename: &str,
    file_source: Arc<dyn FileSource>,
@@ -89,7 +88,7 @@ pub(crate) fn scan_config(
    let filename = &filename.replace('\\', "/");
    let file_group = FileGroup::new(vec![PartitionedFile::new(filename.clone(), 4096)]);

-    FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema, file_source)
+    FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
        .with_file_group(file_group)
        .with_limit(limit)
        .build()
@@ -109,7 +108,7 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi

    let size = store.read(origin_path).await.unwrap().len();

-    let config = scan_config(schema, None, origin_path, Arc::new(JsonSource::new()));
+    let config = scan_config(None, origin_path, Arc::new(JsonSource::new(schema)));
    let stream = FileStream::new(
        &config,
        0,
@@ -151,10 +150,8 @@ pub async fn setup_stream_to_csv_test(

    let schema = csv_basic_schema();

-    let csv_source = CsvSource::new(true, b',', b'"')
-        .with_schema(schema.clone())
-        .with_batch_size(TEST_BATCH_SIZE);
-    let config = scan_config(schema, None, origin_path, csv_source.clone());
+    let csv_source = CsvSource::new(schema).with_batch_size(TEST_BATCH_SIZE);
+    let config = scan_config(None, origin_path, csv_source.clone());
    let size = store.read(origin_path).await.unwrap().len();

    let csv_opener = csv_source.create_file_opener(
--- a/src/common/function/Cargo.toml
+++ b/src/common/function/Cargo.toml
@@ -17,6 +17,7 @@ ahash.workspace = true
 api.workspace = true
 arc-swap = "1.0"
 arrow.workspace = true
+arrow-cast.workspace = true
 arrow-schema.workspace = true
 async-trait.workspace = true
 bincode = "=1.3.3"
@@ -46,6 +47,7 @@ geohash = { version = "0.13", optional = true }
 h3o = { version = "0.6", optional = true }
 hyperloglogplus = "0.4"
 jsonb.workspace = true
+jsonpath-rust = "0.7.5"
 memchr = "2.7"
 mito-codec.workspace = true
 nalgebra.workspace = true
--- a/src/common/function/src/admin/flush_compact_region.rs
+++ b/src/common/function/src/admin/flush_compact_region.rs
@@ -104,7 +104,8 @@ mod tests {
                    assert!(matches!(f.signature(),
                                     datafusion_expr::Signature {
                                         type_signature: datafusion_expr::TypeSignature::Uniform(1, valid_types),
-                                         volatility: datafusion_expr::Volatility::Immutable
+                                         volatility: datafusion_expr::Volatility::Immutable,
+                                         ..
                                     } if valid_types == &ConcreteDataType::numerics().into_iter().map(|dt| { use datatypes::data_type::DataType; dt.as_arrow_type() }).collect::<Vec<_>>()));
                }

--- a/src/common/function/src/admin/flush_compact_table.rs
+++ b/src/common/function/src/admin/flush_compact_table.rs
@@ -331,7 +331,8 @@ mod tests {
                    assert!(matches!(f.signature(),
                                     datafusion_expr::Signature {
                                         type_signature: datafusion_expr::TypeSignature::Uniform(1, valid_types),
-                                         volatility: datafusion_expr::Volatility::Immutable
+                                         volatility: datafusion_expr::Volatility::Immutable,
+                                         ..
                                     } if valid_types == &vec![ArrowDataType::Utf8]));
                }

--- a/src/common/function/src/admin/migrate_region.rs
+++ b/src/common/function/src/admin/migrate_region.rs
@@ -145,7 +145,8 @@ mod tests {
        assert!(matches!(f.signature(),
                         datafusion_expr::Signature {
                             type_signature: datafusion_expr::TypeSignature::OneOf(sigs),
-                             volatility: datafusion_expr::Volatility::Immutable
+                             volatility: datafusion_expr::Volatility::Immutable,
+                             ..
                         } if sigs.len() == 2));
    }

--- a/src/common/function/src/aggrs/aggr_wrapper.rs
+++ b/src/common/function/src/aggrs/aggr_wrapper.rs
@@ -341,6 +341,7 @@ impl AggregateUDFImpl for StateWrapper {
                name: acc_args.name,
                is_distinct: acc_args.is_distinct,
                exprs: acc_args.exprs,
+                expr_fields: acc_args.expr_fields,
            };
            self.inner.accumulator(acc_args)?
        };
--- a/src/common/function/src/aggrs/aggr_wrapper/tests.rs
+++ b/src/common/function/src/aggrs/aggr_wrapper/tests.rs
@@ -650,7 +650,7 @@ async fn test_last_value_order_by_udaf() {
                        DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
                        true
                    ), // ordering field is added to state fields too
-                    Field::new("is_set", DataType::Boolean, true)
+                    Field::new("last_value[last_value_is_set]", DataType::Boolean, true)
                ]
                .into()
            ),
@@ -735,7 +735,7 @@ async fn test_last_value_order_by_udaf() {
                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
                    true,
                ),
-                Field::new("is_set", DataType::Boolean, true),
+                Field::new("last_value[last_value_is_set]", DataType::Boolean, true),
            ]
            .into(),
            vec![
--- a/src/common/function/src/scalars/date/date_add.rs
+++ b/src/common/function/src/scalars/date/date_add.rs
@@ -122,7 +122,8 @@ mod tests {
            matches!(f.signature(),
                         Signature {
                             type_signature: TypeSignature::OneOf(sigs),
-                             volatility: Volatility::Immutable
+                             volatility: Volatility::Immutable,
+                             ..
                         } if  sigs.len() == 15),
            "{:?}",
            f.signature()
--- a/src/common/function/src/scalars/date/date_format.rs
+++ b/src/common/function/src/scalars/date/date_format.rs
@@ -193,7 +193,8 @@ mod tests {
        assert!(matches!(f.signature(),
                         Signature {
                             type_signature: TypeSignature::OneOf(sigs),
-                             volatility: Volatility::Immutable
+                             volatility: Volatility::Immutable,
+                             ..
                         } if  sigs.len() == 6));
    }

--- a/src/common/function/src/scalars/date/date_sub.rs
+++ b/src/common/function/src/scalars/date/date_sub.rs
@@ -120,7 +120,8 @@ mod tests {
            matches!(f.signature(),
                         Signature {
                             type_signature: TypeSignature::OneOf(sigs),
-                             volatility: Volatility::Immutable
+                             volatility: Volatility::Immutable,
+                             ..
                         } if  sigs.len() == 15),
            "{:?}",
            f.signature()
--- a/src/common/function/src/scalars/json/json_get.rs
+++ b/src/common/function/src/scalars/json/json_get.rs
@@ -13,17 +13,23 @@
 // limitations under the License.

 use std::fmt::{self, Display};
+use std::str::FromStr;
 use std::sync::Arc;

+use arrow::array::{ArrayRef, BinaryViewArray, StringViewArray, StructArray};
 use arrow::compute;
-use datafusion_common::DataFusionError;
+use arrow::datatypes::{Float64Type, Int64Type, UInt64Type};
 use datafusion_common::arrow::array::{
    Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder,
    StringViewBuilder,
 };
 use datafusion_common::arrow::datatypes::DataType;
-use datafusion_expr::type_coercion::aggregates::STRINGS;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature};
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+use datatypes::arrow_array::{int_array_value_at_index, string_array_value_at_index};
+use datatypes::json::JsonStructureSettings;
+use jsonpath_rust::JsonPath;
+use serde_json::Value;

 use crate::function::{Function, extract_args};
 use crate::helper;
@@ -124,13 +130,6 @@ macro_rules! json_get {
    };
 }

-json_get!(
-    JsonGetInt,
-    Int64,
-    i64,
-    "Get the value from the JSONB by the given path and return it as an integer."
-);
-
 json_get!(
    JsonGetFloat,
    Float64,
@@ -145,70 +144,356 @@ json_get!(
    "Get the value from the JSONB by the given path and return it as a boolean."
 );

-/// Get the value from the JSONB by the given path and return it as a string.
-#[derive(Clone, Debug)]
-pub struct JsonGetString {
+enum JsonResultValue<'a> {
+    Jsonb(Vec<u8>),
+    JsonStructByColumn(&'a ArrayRef, usize),
+    JsonStructByValue(&'a Value),
+}
+
+trait JsonGetResultBuilder {
+    fn append_value(&mut self, value: JsonResultValue<'_>) -> Result<()>;
+
+    fn append_null(&mut self);
+
+    fn build(&mut self) -> ArrayRef;
+}
+
+/// Common implementation for JSON get scalar functions.
+///
+/// `JsonGet` encapsulates the logic for extracting values from JSON inputs
+/// based on a path expression. Different JSON get functions reuse this
+/// implementation by supplying their own `JsonGetResultBuilder` to control
+/// how the resulting values are materialized into an Arrow array.
+struct JsonGet {
    signature: Signature,
 }

-impl JsonGetString {
-    pub const NAME: &'static str = "json_get_string";
+impl JsonGet {
+    fn invoke<F, B>(&self, args: ScalarFunctionArgs, builder_factory: F) -> Result<ColumnarValue>
+    where
+        F: Fn(usize) -> B,
+        B: JsonGetResultBuilder,
+    {
+        let [arg0, arg1] = extract_args("JSON_GET", &args)?;
+
+        let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
+        let paths = arg1.as_string_view();
+
+        let mut builder = (builder_factory)(arg0.len());
+        match arg0.data_type() {
+            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
+                let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
+                let jsons = arg0.as_binary_view();
+                jsonb_get(jsons, paths, &mut builder)?;
+            }
+            DataType::Struct(_) => {
+                let jsons = arg0.as_struct();
+                json_struct_get(jsons, paths, &mut builder)?
+            }
+            _ => {
+                return Err(DataFusionError::Execution(format!(
+                    "JSON_GET not supported argument type {}",
+                    arg0.data_type(),
+                )));
+            }
+        };
+
+        Ok(ColumnarValue::Array(builder.build()))
+    }
 }

-impl Default for JsonGetString {
+impl Default for JsonGet {
    fn default() -> Self {
        Self {
-            // TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type.
-            signature: helper::one_of_sigs2(
-                vec![DataType::Binary, DataType::BinaryView],
-                vec![DataType::Utf8, DataType::Utf8View],
-            ),
+            signature: Signature::any(2, Volatility::Immutable),
        }
    }
 }

+#[derive(Default)]
+pub struct JsonGetString(JsonGet);
+
+impl JsonGetString {
+    pub const NAME: &'static str = "json_get_string";
+}
+
 impl Function for JsonGetString {
    fn name(&self) -> &str {
        Self::NAME
    }

-    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
+    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
        Ok(DataType::Utf8View)
    }

    fn signature(&self) -> &Signature {
-        &self.signature
+        &self.0.signature
    }

-    fn invoke_with_args(
-        &self,
-        args: ScalarFunctionArgs,
-    ) -> datafusion_common::Result<ColumnarValue> {
-        let [arg0, arg1] = extract_args(self.name(), &args)?;
-        let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
-        let jsons = arg0.as_binary_view();
-        let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
-        let paths = arg1.as_string_view();
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        struct StringResultBuilder(StringViewBuilder);

-        let size = jsons.len();
-        let mut builder = StringViewBuilder::with_capacity(size);
-
-        for i in 0..size {
-            let json = jsons.is_valid(i).then(|| jsons.value(i));
-            let path = paths.is_valid(i).then(|| paths.value(i));
-            let result = match (json, path) {
-                (Some(json), Some(path)) => {
-                    get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
+        impl JsonGetResultBuilder for StringResultBuilder {
+            fn append_value(&mut self, value: JsonResultValue<'_>) -> Result<()> {
+                match value {
+                    JsonResultValue::Jsonb(value) => {
+                        self.0.append_option(jsonb::to_str(&value).ok())
+                    }
+                    JsonResultValue::JsonStructByColumn(column, i) => {
+                        if let Some(v) = string_array_value_at_index(column, i) {
+                            self.0.append_value(v);
+                        } else {
+                            self.0
+                                .append_value(arrow_cast::display::array_value_to_string(
+                                    column, i,
+                                )?);
+                        }
+                    }
+                    JsonResultValue::JsonStructByValue(value) => {
+                        if let Some(s) = value.as_str() {
+                            self.0.append_value(s)
+                        } else {
+                            self.0.append_value(value.to_string())
+                        }
+                    }
                }
-                _ => None,
-            };
-            builder.append_option(result);
+                Ok(())
+            }
+
+            fn append_null(&mut self) {
+                self.0.append_null();
+            }
+
+            fn build(&mut self) -> ArrayRef {
+                Arc::new(self.0.finish())
+            }
        }

-        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+        self.0.invoke(args, |len: usize| {
+            StringResultBuilder(StringViewBuilder::with_capacity(len))
+        })
    }
 }

+#[derive(Default)]
+pub struct JsonGetInt(JsonGet);
+
+impl JsonGetInt {
+    pub const NAME: &'static str = "json_get_int";
+}
+
+impl Function for JsonGetInt {
+    fn name(&self) -> &str {
+        Self::NAME
+    }
+
+    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Int64)
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.0.signature
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        struct IntResultBuilder(Int64Builder);
+
+        impl JsonGetResultBuilder for IntResultBuilder {
+            fn append_value(&mut self, value: JsonResultValue<'_>) -> Result<()> {
+                match value {
+                    JsonResultValue::Jsonb(value) => {
+                        self.0.append_option(jsonb::to_i64(&value).ok())
+                    }
+                    JsonResultValue::JsonStructByColumn(column, i) => {
+                        self.0.append_option(int_array_value_at_index(column, i))
+                    }
+                    JsonResultValue::JsonStructByValue(value) => {
+                        self.0.append_option(value.as_i64())
+                    }
+                }
+                Ok(())
+            }
+
+            fn append_null(&mut self) {
+                self.0.append_null();
+            }
+
+            fn build(&mut self) -> ArrayRef {
+                Arc::new(self.0.finish())
+            }
+        }
+
+        self.0.invoke(args, |len: usize| {
+            IntResultBuilder(Int64Builder::with_capacity(len))
+        })
+    }
+}
+
+impl Display for JsonGetInt {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", Self::NAME.to_ascii_uppercase())
+    }
+}
+
+/// Evaluates each row's JSON path against the JSONB value of the same row, appending the
+/// extracted JSONB to `builder`, or a null when either input is null or nothing is found.
+fn jsonb_get(
+    jsons: &BinaryViewArray,
+    paths: &StringViewArray,
+    builder: &mut impl JsonGetResultBuilder,
+) -> Result<()> {
+    for row in 0..jsons.len() {
+        // a null JSON or a null path can never produce a value
+        let found = if jsons.is_null(row) || paths.is_null(row) {
+            None
+        } else {
+            get_json_by_path(jsons.value(row), paths.value(row))
+        };
+        match found {
+            Some(jsonb) => builder.append_value(JsonResultValue::Jsonb(jsonb))?,
+            None => builder.append_null(),
+        }
+    }
+    Ok(())
+}
+
+/// Appends to `builder` the value each row's JSON path selects from `jsons` (null if absent).
+fn json_struct_get(
+    jsons: &StructArray,
+    paths: &StringViewArray,
+    builder: &mut impl JsonGetResultBuilder,
+) -> Result<()> {
+    for i in 0..jsons.len() {
+        if jsons.is_null(i) || paths.is_null(i) {
+            builder.append_null();
+            continue;
+        }
+        let path = paths.value(i);
+
+        // naively assume the JSON path is our kind of indexing to the field, by removing its
+        // "root"; only the leading "$." is stripped so a "$." inside a field name is kept
+        let field_path = path.trim();
+        let field_path = field_path.strip_prefix("$.").unwrap_or(field_path);
+        if let Some(column) = jsons.column_by_name(field_path) {
+            builder.append_value(JsonResultValue::JsonStructByColumn(column, i))?;
+            continue;
+        }
+        let Some(raw) = jsons
+            .column_by_name(JsonStructureSettings::RAW_FIELD)
+            .and_then(|x| string_array_value_at_index(x, i))
+        else {
+            builder.append_null();
+            continue;
+        };
+
+        let path: JsonPath<Value> = JsonPath::try_from(path).map_err(|e| {
+            DataFusionError::Execution(format!("{path} is not a valid JSON path: {e}"))
+        })?;
+        // the wanted field is not retrievable from the JSON struct columns directly, we have
+        // to combine everything (columns and the "_raw") into a complete JSON value to find it
+        let found = path.find(&json_struct_to_value(raw, jsons, i)?);
+        match &found {
+            Value::Null => builder.append_null(),
+            Value::Array(values) => match values.as_slice() {
+                [] => builder.append_null(),
+                [x] => builder.append_value(JsonResultValue::JsonStructByValue(x))?,
+                // several matches: hand back the array of matches, not the whole document
+                _ => builder.append_value(JsonResultValue::JsonStructByValue(&found))?,
+            },
+            _ => builder.append_value(JsonResultValue::JsonStructByValue(&found))?,
+        }
+    }
+
+    Ok(())
+}
+
+fn json_struct_to_value(raw: &str, jsons: &StructArray, i: usize) -> Result<Value> {
+    let Ok(mut json) = Value::from_str(raw) else {
+        return Err(DataFusionError::Internal(format!(
+            "inner field '{}' is not a valid JSON string",
+            JsonStructureSettings::RAW_FIELD
+        )));
+    };
+
+    for (column_name, column) in jsons.column_names().into_iter().zip(jsons.columns()) {
+        if column_name == JsonStructureSettings::RAW_FIELD {
+            continue;
+        }
+
+        let (json_pointer, field) = if let Some((json_object, field)) = column_name.rsplit_once(".")
+        {
+            let json_pointer = format!("/{}", json_object.replace(".", "/"));
+            (json_pointer, field)
+        } else {
+            ("".to_string(), column_name)
+        };
+        let Some(json_object) = json
+            .pointer_mut(&json_pointer)
+            .and_then(|x| x.as_object_mut())
+        else {
+            return Err(DataFusionError::Internal(format!(
+                "value at JSON pointer '{}' is not an object",
+                json_pointer
+            )));
+        };
+
+        macro_rules! insert {
+            ($column: ident, $i: ident, $json_object: ident, $field: ident) => {{
+                if let Some(value) = $column
+                    .is_valid($i)
+                    .then(|| serde_json::Value::from($column.value($i)))
+                {
+                    $json_object.insert($field.to_string(), value);
+                }
+            }};
+        }
+
+        match column.data_type() {
+            // boolean => Value::Bool
+            DataType::Boolean => {
+                let column = column.as_boolean();
+                insert!(column, i, json_object, field);
+            }
+            // int => Value::Number
+            DataType::Int64 => {
+                let column = column.as_primitive::<Int64Type>();
+                insert!(column, i, json_object, field);
+            }
+            DataType::UInt64 => {
+                let column = column.as_primitive::<UInt64Type>();
+                insert!(column, i, json_object, field);
+            }
+            DataType::Float64 => {
+                let column = column.as_primitive::<Float64Type>();
+                insert!(column, i, json_object, field);
+            }
+            // string => Value::String
+            DataType::Utf8 => {
+                let column = column.as_string::<i32>();
+                insert!(column, i, json_object, field);
+            }
+            DataType::LargeUtf8 => {
+                let column = column.as_string::<i64>();
+                insert!(column, i, json_object, field);
+            }
+            DataType::Utf8View => {
+                let column = column.as_string_view();
+                insert!(column, i, json_object, field);
+            }
+            // other => Value::Array and Value::Object
+            _ => {
+                return Err(DataFusionError::NotImplemented(format!(
+                    "{} is not yet supported to be executed with field {} of datatype {}",
+                    JsonGetString::NAME,
+                    column_name,
+                    column.data_type()
+                )));
+            }
+        }
+    }
+    Ok(json)
+}
+
 impl Display for JsonGetString {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", Self::NAME.to_ascii_uppercase())
@@ -233,7 +518,7 @@ impl Default for JsonGetObject {
                    DataType::LargeBinary,
                    DataType::BinaryView,
                ],
-                STRINGS.to_vec(),
+                vec![DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View],
            ),
        }
    }
@@ -296,14 +581,60 @@ impl Display for JsonGetObject {
 mod tests {
    use std::sync::Arc;

+    use arrow::array::{Float64Array, Int64Array, StructArray};
    use arrow_schema::Field;
    use datafusion_common::ScalarValue;
    use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray};
    use datafusion_common::arrow::datatypes::{Float64Type, Int64Type};
    use datatypes::types::parse_string_to_jsonb;
+    use serde_json::json;

    use super::*;

+    /// Create a JSON object like this (as a one element struct array for testing):
+    ///
+    /// ```JSON
+    /// {
+    ///     "kind": "foo",
+    ///     "payload": {
+    ///         "code": 404,
+    ///         "success": false,
+    ///         "result": {
+    ///             "error": "not found",
+    ///             "time_cost": 1.234
+    ///         }
+    ///     }
+    /// }
+    /// ```
+    fn test_json_struct() -> ArrayRef {
+        Arc::new(StructArray::new(
+            vec![
+                Field::new("kind", DataType::Utf8, true),
+                Field::new("payload.code", DataType::Int64, true),
+                Field::new("payload.result.time_cost", DataType::Float64, true),
+                Field::new(JsonStructureSettings::RAW_FIELD, DataType::Utf8View, true),
+            ]
+            .into(),
+            vec![
+                Arc::new(StringArray::from_iter([Some("foo")])) as ArrayRef,
+                Arc::new(Int64Array::from_iter([Some(404)])),
+                Arc::new(Float64Array::from_iter([Some(1.234)])),
+                Arc::new(StringViewArray::from_iter([Some(
+                    json! ({
+                        "payload": {
+                            "success": false,
+                            "result": {
+                                "error": "not found"
+                            }
+                        }
+                    })
+                    .to_string(),
+                )])),
+            ],
+            None,
+        ))
+    }
+
    #[test]
    fn test_json_get_int() {
        let json_get_int = JsonGetInt::default();
@@ -321,37 +652,55 @@ mod tests {
            r#"{"a": 4, "b": {"c": 6}, "c": 6}"#,
            r#"{"a": 7, "b": 8, "c": {"a": 7}}"#,
        ];
-        let paths = vec!["$.a.b", "$.a", "$.c"];
-        let results = [Some(2), Some(4), None];
+        let json_struct = test_json_struct();

-        let jsonbs = json_strings
+        let path_expects = vec![
+            ("$.a.b", Some(2)),
+            ("$.a", Some(4)),
+            ("$.c", None),
+            ("$.kind", None),
+            ("$.payload.code", Some(404)),
+            ("$.payload.success", None),
+            ("$.payload.result.time_cost", None),
+            ("$.payload.not-exists", None),
+            ("$.not-exists", None),
+            ("$", None),
+        ];
+
+        let mut jsons = json_strings
            .iter()
            .map(|s| {
                let value = jsonb::parse_value(s.as_bytes()).unwrap();
-                value.to_vec()
+                Arc::new(BinaryArray::from_iter_values([value.to_vec()])) as ArrayRef
            })
            .collect::<Vec<_>>();
+        let json_struct_arrays =
+            std::iter::repeat_n(json_struct, path_expects.len() - jsons.len()).collect::<Vec<_>>();
+        jsons.extend(json_struct_arrays);

-        let args = ScalarFunctionArgs {
-            args: vec![
-                ColumnarValue::Array(Arc::new(BinaryArray::from_iter_values(jsonbs))),
-                ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))),
-            ],
-            arg_fields: vec![],
-            number_rows: 3,
-            return_field: Arc::new(Field::new("x", DataType::Int64, false)),
-            config_options: Arc::new(Default::default()),
-        };
-        let result = json_get_int
-            .invoke_with_args(args)
-            .and_then(|x| x.to_array(3))
-            .unwrap();
-        let vector = result.as_primitive::<Int64Type>();
+        for i in 0..jsons.len() {
+            let json = &jsons[i];
+            let (path, expect) = path_expects[i];

-        assert_eq!(3, vector.len());
-        for (i, gt) in results.iter().enumerate() {
-            let result = vector.is_valid(i).then(|| vector.value(i));
-            assert_eq!(*gt, result);
+            let args = ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(json.clone()),
+                    ColumnarValue::Scalar(path.into()),
+                ],
+                arg_fields: vec![],
+                number_rows: 1,
+                return_field: Arc::new(Field::new("x", DataType::Int64, false)),
+                config_options: Arc::new(Default::default()),
+            };
+            let result = json_get_int
+                .invoke_with_args(args)
+                .and_then(|x| x.to_array(1))
+                .unwrap();
+
+            let result = result.as_primitive::<Int64Type>();
+            assert_eq!(1, result.len());
+            let actual = result.is_valid(0).then(|| result.value(0));
+            assert_eq!(actual, expect);
        }
    }

@@ -474,42 +823,85 @@ mod tests {
            r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#,
            r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#,
        ];
-        let paths = vec!["$.a.b", "$.a", ""];
-        let results = [Some("a"), Some("d"), None];
+        let json_struct = test_json_struct();

-        let jsonbs = json_strings
+        let paths = vec![
+            "$.a.b",
+            "$.a",
+            "",
+            "$.kind",
+            "$.payload.code",
+            "$.payload.result.time_cost",
+            "$.payload",
+            "$.payload.success",
+            "$.payload.result",
+            "$.payload.result.error",
+            "$.payload.result.not-exists",
+            "$.payload.not-exists",
+            "$.not-exists",
+            "$",
+        ];
+        let expects = [
+            Some("a"),
+            Some("d"),
+            None,
+            Some("foo"),
+            Some("404"),
+            Some("1.234"),
+            Some(
+                r#"{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}"#,
+            ),
+            Some("false"),
+            Some(r#"{"error":"not found","time_cost":1.234}"#),
+            Some("not found"),
+            None,
+            None,
+            None,
+            Some(
+                r#"{"kind":"foo","payload":{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}}"#,
+            ),
+        ];
+
+        let mut jsons = json_strings
            .iter()
            .map(|s| {
                let value = jsonb::parse_value(s.as_bytes()).unwrap();
-                value.to_vec()
+                Arc::new(BinaryArray::from_iter_values([value.to_vec()])) as ArrayRef
            })
            .collect::<Vec<_>>();
+        let json_struct_arrays =
+            std::iter::repeat_n(json_struct, expects.len() - jsons.len()).collect::<Vec<_>>();
+        jsons.extend(json_struct_arrays);

-        let args = ScalarFunctionArgs {
-            args: vec![
-                ColumnarValue::Array(Arc::new(BinaryArray::from_iter_values(jsonbs))),
-                ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))),
-            ],
-            arg_fields: vec![],
-            number_rows: 3,
-            return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
-            config_options: Arc::new(Default::default()),
-        };
-        let result = json_get_string
-            .invoke_with_args(args)
-            .and_then(|x| x.to_array(3))
-            .unwrap();
-        let vector = result.as_string_view();
+        for i in 0..jsons.len() {
+            let json = &jsons[i];
+            let path = paths[i];
+            let expect = expects[i];

-        assert_eq!(3, vector.len());
-        for (i, gt) in results.iter().enumerate() {
-            let result = vector.is_valid(i).then(|| vector.value(i));
-            assert_eq!(*gt, result);
+            let args = ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(json.clone()),
+                    ColumnarValue::Scalar(path.into()),
+                ],
+                arg_fields: vec![],
+                number_rows: 1,
+                return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
+                config_options: Arc::new(Default::default()),
+            };
+            let result = json_get_string
+                .invoke_with_args(args)
+                .and_then(|x| x.to_array(1))
+                .unwrap();
+
+            let result = result.as_string_view();
+            assert_eq!(1, result.len());
+            let actual = result.is_valid(0).then(|| result.value(0));
+            assert_eq!(actual, expect);
        }
    }

    #[test]
-    fn test_json_get_object() -> datafusion_common::Result<()> {
+    fn test_json_get_object() -> Result<()> {
        let udf = JsonGetObject::default();
        assert_eq!("json_get_object", udf.name());
        assert_eq!(
--- a/src/common/function/src/scalars/math/rate.rs
+++ b/src/common/function/src/scalars/math/rate.rs
@@ -99,7 +99,8 @@ mod tests {
        assert!(matches!(rate.signature(),
                         Signature {
                             type_signature: TypeSignature::Uniform(2, valid_types),
-                             volatility: Volatility::Immutable
+                             volatility: Volatility::Immutable,
+                             ..
                         } if  valid_types == NUMERICS
        ));
        let values = vec![1.0, 3.0, 6.0];
--- a/src/common/function/src/scalars/string.rs
+++ b/src/common/function/src/scalars/string.rs
@@ -14,13 +14,31 @@

 //! String scalar functions

+mod elt;
+mod field;
+mod format;
+mod insert;
+mod locate;
 mod regexp_extract;
+mod space;

+pub(crate) use elt::EltFunction;
+pub(crate) use field::FieldFunction;
+pub(crate) use format::FormatFunction;
+pub(crate) use insert::InsertFunction;
+pub(crate) use locate::LocateFunction;
 pub(crate) use regexp_extract::RegexpExtractFunction;
+pub(crate) use space::SpaceFunction;

 use crate::function_registry::FunctionRegistry;

 /// Register all string functions
 pub fn register_string_functions(registry: &FunctionRegistry) {
+    EltFunction::register(registry);
+    FieldFunction::register(registry);
+    FormatFunction::register(registry);
+    InsertFunction::register(registry);
+    LocateFunction::register(registry);
    RegexpExtractFunction::register(registry);
+    SpaceFunction::register(registry);
 }
--- a/src/common/function/src/scalars/string/elt.rs
+++ b/src/common/function/src/scalars/string/elt.rs
@@ -0,0 +1,252 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! MySQL-compatible ELT function implementation.
+//!
+//! ELT(N, str1, str2, str3, ...) - Returns the Nth string from the list.
+//! Returns NULL if N < 1 or N > number of strings.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion_common::DataFusionError;
+use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, LargeStringBuilder};
+use datafusion_common::arrow::compute::cast;
+use datafusion_common::arrow::datatypes::DataType;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+
+use crate::function::Function;
+use crate::function_registry::FunctionRegistry;
+
+const NAME: &str = "elt";
+
+/// MySQL-compatible ELT function.
+///
+/// Syntax: ELT(N, str1, str2, str3, ...)
+/// Returns the Nth string argument. N is 1-based.
+/// Returns NULL if N is NULL, N < 1, or N > number of string arguments.
+#[derive(Debug)]
+pub struct EltFunction {
+    signature: Signature,
+}
+
+impl EltFunction {
+    pub fn register(registry: &FunctionRegistry) {
+        registry.register_scalar(EltFunction::default());
+    }
+}
+
+impl Default for EltFunction {
+    fn default() -> Self {
+        Self {
+            // ELT takes a variable number of arguments: (Int64, String, String, ...)
+            signature: Signature::variadic_any(Volatility::Immutable),
+        }
+    }
+}
+
+impl fmt::Display for EltFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", NAME.to_ascii_uppercase())
+    }
+}
+
+impl Function for EltFunction {
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
+        Ok(DataType::LargeUtf8)
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Evaluates `ELT(N, str1, ...)` row by row, yielding NULL when N is NULL or out of range.
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        if args.args.len() < 2 {
+            return Err(DataFusionError::Execution(
+                "ELT requires at least 2 arguments: ELT(N, str1, ...)".to_string(),
+            ));
+        }
+
+        let arrays = ColumnarValue::values_to_arrays(&args.args)?;
+        let len = arrays[0].len();
+
+        // First argument is the index (N) - try to cast to Int64
+        let index_array = if arrays[0].data_type() == &DataType::Null {
+            // All NULLs - return all NULLs
+            let mut builder = LargeStringBuilder::with_capacity(len, 0);
+            for _ in 0..len {
+                builder.append_null();
+            }
+            return Ok(ColumnarValue::Array(Arc::new(builder.finish())));
+        } else {
+            cast(arrays[0].as_ref(), &DataType::Int64).map_err(|e| {
+                DataFusionError::Execution(format!("ELT: index argument cast failed: {}", e))
+            })?
+        };
+        // Downcast once up front instead of on every row.
+        let index = index_array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
+
+        // Cast string arguments to LargeUtf8
+        let string_arrays: Vec<ArrayRef> = arrays[1..]
+            .iter()
+            .enumerate()
+            .map(|(i, arr)| {
+                cast(arr.as_ref(), &DataType::LargeUtf8).map_err(|e| {
+                    DataFusionError::Execution(format!(
+                        "ELT: string argument {} cast failed: {}",
+                        i + 1,
+                        e
+                    ))
+                })
+            })
+            .collect::<datafusion_common::Result<Vec<_>>>()?;
+
+        let mut builder = LargeStringBuilder::with_capacity(len, len * 32);
+        for i in 0..len {
+            if index.is_null(i) {
+                builder.append_null();
+                continue;
+            }
+
+            // N is 1-based. `try_from` rejects negatives and values that do not fit in `usize`
+            // (an `as` cast could wrap on 32-bit targets); `get` does the upper-bound check.
+            let selected = usize::try_from(index.value(i))
+                .ok()
+                .and_then(|n| n.checked_sub(1))
+                .and_then(|pos| string_arrays.get(pos));
+            let Some(strings) = selected else {
+                builder.append_null();
+                continue;
+            };
+
+            let strings = strings.as_string::<i64>();
+            if strings.is_null(i) {
+                builder.append_null();
+            } else {
+                builder.append_value(strings.value(i));
+            }
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion_common::arrow::array::{Int64Array, StringArray};
+    use datafusion_common::arrow::datatypes::Field;
+    use datafusion_expr::ScalarFunctionArgs;
+
+    use super::*;
+
+    /// Packs `arrays` into `ScalarFunctionArgs`: one nullable field per input array and a
+    /// nullable `LargeUtf8` return field. The row count is taken from the first array.
+    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
+        let number_rows = arrays[0].len();
+
+        let mut arg_fields = Vec::with_capacity(arrays.len());
+        for (position, array) in arrays.iter().enumerate() {
+            let field = Field::new(format!("arg_{}", position), array.data_type().clone(), true);
+            arg_fields.push(Arc::new(field));
+        }
+
+        ScalarFunctionArgs {
+            args: arrays.into_iter().map(ColumnarValue::Array).collect(),
+            arg_fields,
+            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
+            number_rows,
+            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
+        }
+    }
+
+    /// Invokes ELT on `arrays` and unwraps the array result, panicking on a scalar.
+    fn run_elt(arrays: Vec<ArrayRef>) -> ArrayRef {
+        let outcome = EltFunction::default()
+            .invoke_with_args(create_args(arrays))
+            .unwrap();
+        match outcome {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array result"),
+        }
+    }
+
+    #[test]
+    fn test_elt_basic() {
+        let n: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
+        let s1: ArrayRef = Arc::new(StringArray::from(vec!["a", "a", "a"]));
+        let s2: ArrayRef = Arc::new(StringArray::from(vec!["b", "b", "b"]));
+        let s3: ArrayRef = Arc::new(StringArray::from(vec!["c", "c", "c"]));
+
+        let output = run_elt(vec![n, s1, s2, s3]);
+        let strings = output.as_string::<i64>();
+
+        // Row k selects the k-th string argument.
+        assert_eq!(strings.value(0), "a");
+        assert_eq!(strings.value(1), "b");
+        assert_eq!(strings.value(2), "c");
+    }
+
+    #[test]
+    fn test_elt_out_of_bounds() {
+        // With three strings, the indexes 0, 4 and -1 are all outside 1..=3.
+        let n: ArrayRef = Arc::new(Int64Array::from(vec![0, 4, -1]));
+        let s1: ArrayRef = Arc::new(StringArray::from(vec!["a", "a", "a"]));
+        let s2: ArrayRef = Arc::new(StringArray::from(vec!["b", "b", "b"]));
+        let s3: ArrayRef = Arc::new(StringArray::from(vec!["c", "c", "c"]));
+
+        let output = run_elt(vec![n, s1, s2, s3]);
+        let strings = output.as_string::<i64>();
+
+        assert!(strings.is_null(0)); // 0 is out of bounds
+        assert!(strings.is_null(1)); // 4 is out of bounds
+        assert!(strings.is_null(2)); // -1 is out of bounds
+    }
+
+    #[test]
+    fn test_elt_with_nulls() {
+        // Row 0: n=1 picks s1="a"  -> "a"
+        // Row 1: n=NULL            -> NULL
+        // Row 2: n=1 picks s1=NULL -> NULL
+        let n: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(1)]));
+        let s1: ArrayRef = Arc::new(StringArray::from(vec![Some("a"), Some("a"), None]));
+        let s2: ArrayRef = Arc::new(StringArray::from(vec![Some("b"), Some("b"), Some("b")]));
+
+        let output = run_elt(vec![n, s1, s2]);
+        let strings = output.as_string::<i64>();
+
+        assert_eq!(strings.value(0), "a");
+        assert!(strings.is_null(1)); // N is NULL
+        assert!(strings.is_null(2)); // Selected string is NULL
+    }
+}
--- a/src/common/function/src/scalars/string/field.rs
+++ b/src/common/function/src/scalars/string/field.rs
@@ -0,0 +1,224 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! MySQL-compatible FIELD function implementation.
+//!
+//! FIELD(str, str1, str2, str3, ...) - Returns the 1-based index of str in the list.
+//! Returns 0 if str is not found or is NULL.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion_common::DataFusionError;
+use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, Int64Builder};
+use datafusion_common::arrow::compute::cast;
+use datafusion_common::arrow::datatypes::DataType;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+
+use crate::function::Function;
+use crate::function_registry::FunctionRegistry;
+
+/// Lowercase SQL name of the function, returned by `Function::name`.
+const NAME: &str = "field";
+
+/// MySQL-compatible FIELD function.
+///
+/// Syntax: FIELD(str, str1, str2, str3, ...)
+/// Returns the 1-based index of str in the argument list (str1, str2, str3, ...).
+/// Returns 0 if str is not found or is NULL.
+///
+/// Every argument is cast to `LargeUtf8` before comparison, and values are compared
+/// for exact string equality, so matching is case-sensitive. NULL entries in the
+/// list are skipped and never match. The result type is always `Int64`.
+#[derive(Debug)]
+pub struct FieldFunction {
+    /// Argument signature reported through `Function::signature`.
+    signature: Signature,
+}
+
+impl FieldFunction {
+    /// Registers a default-constructed FIELD function as a scalar function in `registry`.
+    pub fn register(registry: &FunctionRegistry) {
+        let function = Self::default();
+        registry.register_scalar(function);
+    }
+}
+
+impl Default for FieldFunction {
+    /// Builds the function with an immutable, variadic "any type" signature.
+    fn default() -> Self {
+        // FIELD accepts a variable number of arguments of any type:
+        // (str, str1, str2, ...). The minimum arity is checked at invocation time.
+        let signature = Signature::variadic_any(Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl fmt::Display for FieldFunction {
+    /// Writes the function name in upper case.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let upper_name = NAME.to_ascii_uppercase();
+        f.write_str(&upper_name)
+    }
+}
+
+impl Function for FieldFunction {
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    /// The result is always an `Int64` position, whatever the argument types are.
+    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
+        Ok(DataType::Int64)
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Evaluates `FIELD(str, str1, ...)` for every row.
+    ///
+    /// All arguments are cast to `LargeUtf8`. For each row the result is the 1-based
+    /// position of the first list entry equal to `str`, or 0 when `str` is NULL or no
+    /// entry matches. NULL list entries never match.
+    ///
+    /// # Errors
+    ///
+    /// Returns an execution error when fewer than two arguments are supplied or when
+    /// an argument cannot be cast to `LargeUtf8`.
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        if args.args.len() < 2 {
+            return Err(DataFusionError::Execution(
+                "FIELD requires at least 2 arguments: FIELD(str, str1, ...)".to_string(),
+            ));
+        }
+
+        let columns = ColumnarValue::values_to_arrays(&args.args)?;
+        let row_count = columns[0].len();
+
+        // Normalise every argument to LargeUtf8 so values can be compared as strings.
+        let mut text_columns: Vec<ArrayRef> = Vec::with_capacity(columns.len());
+        for (idx, column) in columns.iter().enumerate() {
+            let converted = cast(column.as_ref(), &DataType::LargeUtf8).map_err(|e| {
+                DataFusionError::Execution(format!("FIELD: argument {} cast failed: {}", idx, e))
+            })?;
+            text_columns.push(converted);
+        }
+
+        // The first column holds the needles; the remaining ones form the list.
+        let (needle_column, candidate_columns) = (&text_columns[0], &text_columns[1..]);
+        let needles = needle_column.as_string::<i64>();
+        let mut positions = Int64Builder::with_capacity(row_count);
+
+        for row in 0..row_count {
+            let found = if needles.is_null(row) {
+                // A NULL search string is reported as "not found".
+                0
+            } else {
+                let needle = needles.value(row);
+                candidate_columns
+                    .iter()
+                    .position(|column| {
+                        let candidates = column.as_string::<i64>();
+                        !candidates.is_null(row) && candidates.value(row) == needle
+                    })
+                    // Convert the 0-based list offset to a 1-based index.
+                    .map_or(0, |offset| (offset + 1) as i64)
+            };
+
+            positions.append_value(found);
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(positions.finish())))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion_common::arrow::array::StringArray;
+    use datafusion_common::arrow::datatypes::{Field, Int64Type};
+    use datafusion_expr::ScalarFunctionArgs;
+
+    use super::*;
+
+    /// Packs `arrays` into `ScalarFunctionArgs`: one nullable field per input array and a
+    /// nullable `Int64` return field. The row count is taken from the first array.
+    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
+        let number_rows = arrays[0].len();
+
+        let mut arg_fields = Vec::with_capacity(arrays.len());
+        for (position, array) in arrays.iter().enumerate() {
+            let field = Field::new(format!("arg_{}", position), array.data_type().clone(), true);
+            arg_fields.push(Arc::new(field));
+        }
+
+        ScalarFunctionArgs {
+            args: arrays.into_iter().map(ColumnarValue::Array).collect(),
+            arg_fields,
+            return_field: Arc::new(Field::new("result", DataType::Int64, true)),
+            number_rows,
+            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
+        }
+    }
+
+    /// Invokes FIELD on `arrays` and unwraps the array result, panicking on a scalar.
+    fn run_field(arrays: Vec<ArrayRef>) -> ArrayRef {
+        let outcome = FieldFunction::default()
+            .invoke_with_args(create_args(arrays))
+            .unwrap();
+        match outcome {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array result"),
+        }
+    }
+
+    #[test]
+    fn test_field_basic() {
+        let search: ArrayRef = Arc::new(StringArray::from(vec!["b", "d", "a"]));
+        let s1: ArrayRef = Arc::new(StringArray::from(vec!["a", "a", "a"]));
+        let s2: ArrayRef = Arc::new(StringArray::from(vec!["b", "b", "b"]));
+        let s3: ArrayRef = Arc::new(StringArray::from(vec!["c", "c", "c"]));
+
+        let output = run_field(vec![search, s1, s2, s3]);
+        let indexes = output.as_primitive::<Int64Type>();
+
+        assert_eq!(indexes.value(0), 2); // "b" is at index 2
+        assert_eq!(indexes.value(1), 0); // "d" not found
+        assert_eq!(indexes.value(2), 1); // "a" is at index 1
+    }
+
+    #[test]
+    fn test_field_with_null_search() {
+        let search: ArrayRef = Arc::new(StringArray::from(vec![Some("a"), None]));
+        let s1: ArrayRef = Arc::new(StringArray::from(vec!["a", "a"]));
+        let s2: ArrayRef = Arc::new(StringArray::from(vec!["b", "b"]));
+
+        let output = run_field(vec![search, s1, s2]);
+        let indexes = output.as_primitive::<Int64Type>();
+
+        assert_eq!(indexes.value(0), 1); // "a" found at index 1
+        assert_eq!(indexes.value(1), 0); // NULL search returns 0
+    }
+
+    #[test]
+    fn test_field_case_sensitive() {
+        let search: ArrayRef = Arc::new(StringArray::from(vec!["A", "a"]));
+        let s1: ArrayRef = Arc::new(StringArray::from(vec!["a", "a"]));
+        let s2: ArrayRef = Arc::new(StringArray::from(vec!["A", "A"]));
+
+        let output = run_field(vec![search, s1, s2]);
+        let indexes = output.as_primitive::<Int64Type>();
+
+        assert_eq!(indexes.value(0), 2); // "A" matches at index 2
+        assert_eq!(indexes.value(1), 1); // "a" matches at index 1
+    }
+}
--- a/src/common/function/src/scalars/string/format.rs
+++ b/src/common/function/src/scalars/string/format.rs
@@ -0,0 +1,512 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! MySQL-compatible FORMAT function implementation.
+//!
+//! FORMAT(X, D) - Formats the number X with D decimal places using thousand separators.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion_common::DataFusionError;
+use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
+use datafusion_common::arrow::datatypes as arrow_types;
+use datafusion_common::arrow::datatypes::DataType;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
+
+use crate::function::Function;
+use crate::function_registry::FunctionRegistry;
+
+/// Lowercase SQL name of the function, returned by `Function::name`.
+const NAME: &str = "format";
+
+/// MySQL-compatible FORMAT function.
+///
+/// Syntax: FORMAT(X, D)
+/// Formats the number X to a format like '#,###,###.##', rounded to D decimal places.
+/// D can be 0 to 30; values outside that range are clamped to it.
+///
+/// The result is NULL when either X or D is NULL. Integer inputs are rendered from
+/// their digits directly (no conversion through a float), so large values keep
+/// every digit. Float NaN and infinities are rendered as `NaN`, `Infinity` and
+/// `-Infinity`. The result type is always `LargeUtf8`.
+///
+/// Note: This implementation uses the en_US locale (comma as thousand separator,
+/// period as decimal separator).
+#[derive(Debug)]
+pub struct FormatFunction {
+    /// Argument signature reported through `Function::signature`.
+    signature: Signature,
+}
+
+impl FormatFunction {
+    /// Registers a default-constructed FORMAT function as a scalar function in `registry`.
+    pub fn register(registry: &FunctionRegistry) {
+        let function = Self::default();
+        registry.register_scalar(function);
+    }
+}
+
+impl Default for FormatFunction {
+    /// Builds the function with one exact `(X, D)` signature per supported type pair.
+    fn default() -> Self {
+        // D may be any of these integer types.
+        let int_types = [
+            DataType::Int64,
+            DataType::Int32,
+            DataType::Int16,
+            DataType::Int8,
+            DataType::UInt64,
+            DataType::UInt32,
+            DataType::UInt16,
+            DataType::UInt8,
+        ];
+
+        // X may be a float or any of the integer types above (floats listed first).
+        let numeric_types: Vec<DataType> = [DataType::Float64, DataType::Float32]
+            .into_iter()
+            .chain(int_types.iter().cloned())
+            .collect();
+
+        // Cartesian product, iterating X in the outer position and D in the inner one.
+        let signatures = numeric_types
+            .iter()
+            .flat_map(|x_type| {
+                int_types
+                    .iter()
+                    .map(move |d_type| TypeSignature::Exact(vec![x_type.clone(), d_type.clone()]))
+            })
+            .collect();
+
+        Self {
+            signature: Signature::one_of(signatures, Volatility::Immutable),
+        }
+    }
+}
+
+impl fmt::Display for FormatFunction {
+    /// Writes the function name in upper case.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let upper_name = NAME.to_ascii_uppercase();
+        f.write_str(&upper_name)
+    }
+}
+
+impl Function for FormatFunction {
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    /// Formatted numbers are always returned as `LargeUtf8`.
+    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
+        Ok(DataType::LargeUtf8)
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Evaluates `FORMAT(X, D)` for every row.
+    ///
+    /// A row is NULL when either X or D is NULL. Otherwise D is clamped to `0..=30`
+    /// and X is formatted with thousand separators and that many decimal places.
+    ///
+    /// # Errors
+    ///
+    /// Returns an execution error when the argument count is not exactly two, or when
+    /// a non-NULL row is reached and X or D has an unsupported data type.
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        if args.args.len() != 2 {
+            return Err(DataFusionError::Execution(
+                "FORMAT requires exactly 2 arguments: FORMAT(X, D)".to_string(),
+            ));
+        }
+
+        let columns = ColumnarValue::values_to_arrays(&args.args)?;
+        let (numbers, places) = (&columns[0], &columns[1]);
+        let row_count = numbers.len();
+
+        let mut output = LargeStringBuilder::with_capacity(row_count, row_count * 20);
+
+        for row in 0..row_count {
+            if numbers.is_null(row) || places.is_null(row) {
+                output.append_null();
+                continue;
+            }
+
+            // Out-of-range D values are clamped instead of being rejected.
+            let digits = get_decimal_places(places, row)?.clamp(0, 30) as usize;
+
+            let text = match numbers.data_type() {
+                DataType::Float64 | DataType::Float32 => {
+                    let value = get_float_value(numbers, row)?;
+                    format_number_float(value, digits)
+                }
+                // Integers are formatted from their digits, never through a float.
+                DataType::Int64
+                | DataType::Int32
+                | DataType::Int16
+                | DataType::Int8
+                | DataType::UInt64
+                | DataType::UInt32
+                | DataType::UInt16
+                | DataType::UInt8 => format_number_integer(numbers, row, digits)?,
+                unsupported => {
+                    return Err(DataFusionError::Execution(format!(
+                        "FORMAT: unsupported type {:?}",
+                        unsupported
+                    )));
+                }
+            };
+            output.append_value(&text);
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(output.finish())))
+    }
+}
+
+/// Reads the value at `index` from a `Float64` or `Float32` array as an `f64`.
+///
+/// Any other data type yields an execution error. The caller is expected to have
+/// checked that the slot is not NULL.
+fn get_float_value(
+    array: &datafusion_common::arrow::array::ArrayRef,
+    index: usize,
+) -> datafusion_common::Result<f64> {
+    let value = match array.data_type() {
+        DataType::Float64 => array
+            .as_primitive::<arrow_types::Float64Type>()
+            .value(index),
+        // Widening f32 -> f64 is lossless.
+        DataType::Float32 => f64::from(
+            array
+                .as_primitive::<arrow_types::Float32Type>()
+                .value(index),
+        ),
+        unsupported => {
+            return Err(DataFusionError::Execution(format!(
+                "FORMAT: unsupported type {:?}",
+                unsupported
+            )));
+        }
+    };
+
+    Ok(value)
+}
+
+/// Reads the requested number of decimal places at `index` from an integer array.
+///
+/// The value is widened to `i64` so the caller can clamp it to MySQL's `0..=30`
+/// range. `UInt64` values above `i64::MAX` saturate to `i64::MAX`. Non-integer
+/// data types yield an execution error.
+fn get_decimal_places(
+    array: &datafusion_common::arrow::array::ArrayRef,
+    index: usize,
+) -> datafusion_common::Result<i64> {
+    let places = match array.data_type() {
+        DataType::Int64 => array.as_primitive::<arrow_types::Int64Type>().value(index),
+        DataType::Int32 => i64::from(array.as_primitive::<arrow_types::Int32Type>().value(index)),
+        DataType::Int16 => i64::from(array.as_primitive::<arrow_types::Int16Type>().value(index)),
+        DataType::Int8 => i64::from(array.as_primitive::<arrow_types::Int8Type>().value(index)),
+        DataType::UInt64 => {
+            let raw = array.as_primitive::<arrow_types::UInt64Type>().value(index);
+            // Saturate rather than wrap when the value does not fit in an i64.
+            i64::try_from(raw).unwrap_or(i64::MAX)
+        }
+        DataType::UInt32 => i64::from(array.as_primitive::<arrow_types::UInt32Type>().value(index)),
+        DataType::UInt16 => i64::from(array.as_primitive::<arrow_types::UInt16Type>().value(index)),
+        DataType::UInt8 => i64::from(array.as_primitive::<arrow_types::UInt8Type>().value(index)),
+        unsupported => {
+            return Err(DataFusionError::Execution(format!(
+                "FORMAT: unsupported type {:?}",
+                unsupported
+            )));
+        }
+    };
+
+    Ok(places)
+}
+
+/// Formats the integer at `index` with thousand separators, followed by
+/// `decimal_places` zeros after the decimal point when `decimal_places > 0`.
+///
+/// The digits are taken from the integer itself rather than from a float
+/// conversion, so every digit of a large value is preserved. Non-integer data
+/// types yield an execution error.
+fn format_number_integer(
+    array: &datafusion_common::arrow::array::ArrayRef,
+    index: usize,
+    decimal_places: usize,
+) -> datafusion_common::Result<String> {
+    // Split a value into (is_negative, magnitude digits). `unsigned_abs` keeps
+    // `i64::MIN` representable.
+    let signed = |v: i64| (v < 0, v.unsigned_abs().to_string());
+    let unsigned = |v: u64| (false, v.to_string());
+
+    let (negative, digits) = match array.data_type() {
+        DataType::Int64 => signed(array.as_primitive::<arrow_types::Int64Type>().value(index)),
+        DataType::Int32 => signed(i64::from(
+            array.as_primitive::<arrow_types::Int32Type>().value(index),
+        )),
+        DataType::Int16 => signed(i64::from(
+            array.as_primitive::<arrow_types::Int16Type>().value(index),
+        )),
+        DataType::Int8 => signed(i64::from(
+            array.as_primitive::<arrow_types::Int8Type>().value(index),
+        )),
+        DataType::UInt64 => unsigned(array.as_primitive::<arrow_types::UInt64Type>().value(index)),
+        DataType::UInt32 => unsigned(u64::from(
+            array.as_primitive::<arrow_types::UInt32Type>().value(index),
+        )),
+        DataType::UInt16 => unsigned(u64::from(
+            array.as_primitive::<arrow_types::UInt16Type>().value(index),
+        )),
+        DataType::UInt8 => unsigned(u64::from(
+            array.as_primitive::<arrow_types::UInt8Type>().value(index),
+        )),
+        unsupported => {
+            return Err(DataFusionError::Execution(format!(
+                "FORMAT: unsupported type {:?}",
+                unsupported
+            )));
+        }
+    };
+
+    let sign = if negative { "-" } else { "" };
+    let grouped = add_thousand_separators(&digits);
+
+    let formatted = if decimal_places == 0 {
+        format!("{}{}", sign, grouped)
+    } else {
+        // An integer has no fractional digits, so the fraction is all zeros.
+        format!("{}{}.{}", sign, grouped, "0".repeat(decimal_places))
+    };
+
+    Ok(formatted)
+}
+
+/// Format a float with thousand separators and `decimal_places` digits after the decimal point.
+///
+/// NaN is rendered as `NaN` and the infinities as `Infinity` / `-Infinity`.
+/// Finite values are rounded half away from zero to `decimal_places` digits. A
+/// value that rounds to zero is rendered without a minus sign.
+///
+/// Rounding scales the value by `10^decimal_places`. When that scaled value is
+/// not finite (a very large `x`, or a very large `decimal_places`), the scaling
+/// step is skipped and `x` is formatted as is; otherwise a finite input would be
+/// rendered as `inf` or `NaN`.
+fn format_number_float(x: f64, decimal_places: usize) -> String {
+    // Handle special cases
+    if x.is_nan() {
+        return "NaN".to_string();
+    }
+    if x.is_infinite() {
+        return if x.is_sign_positive() {
+            "Infinity".to_string()
+        } else {
+            "-Infinity".to_string()
+        };
+    }
+
+    // Round to decimal_places (half away from zero, unlike `format!`'s own
+    // rounding), guarding against overflow of the scaled intermediate.
+    let exponent = i32::try_from(decimal_places).unwrap_or(i32::MAX);
+    let multiplier = 10f64.powi(exponent);
+    let scaled = x * multiplier;
+    let rounded = if scaled.is_finite() {
+        scaled.round() / multiplier
+    } else {
+        x
+    };
+
+    // Format the magnitude; the sign is added back after grouping.
+    let is_negative = rounded < 0.0;
+    let formatted = format!("{:.prec$}", rounded.abs(), prec = decimal_places);
+
+    // Split at the decimal point (absent when decimal_places == 0).
+    let (int_part, dec_part) = match formatted.split_once('.') {
+        Some((int_part, dec_part)) => (int_part, Some(dec_part)),
+        None => (formatted.as_str(), None),
+    };
+
+    // Build result
+    let mut result = String::with_capacity(formatted.len() + int_part.len() / 3 + 1);
+    if is_negative {
+        result.push('-');
+    }
+    result.push_str(&add_thousand_separators(int_part));
+    if let Some(dec) = dec_part {
+        result.push('.');
+        result.push_str(dec);
+    }
+
+    result
+}
+
+/// Add thousand separators (commas) to an integer string.
+///
+/// `s` is expected to hold only the digits of a non-negative integer. Strings of
+/// three characters or fewer are returned unchanged.
+fn add_thousand_separators(s: &str) -> String {
+    let len = s.chars().count();
+    let mut result = String::with_capacity(s.len() + len / 3);
+
+    for (i, ch) in s.chars().enumerate() {
+        // A comma goes before every character whose distance to the end of the
+        // string is a non-zero multiple of three, except the very first one.
+        if i > 0 && (len - i) % 3 == 0 {
+            result.push(',');
+        }
+        result.push(ch);
+    }
+
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion_common::arrow::array::{ArrayRef, Float64Array, Int64Array, UInt64Array};
+    use datafusion_common::arrow::datatypes::Field;
+    use datafusion_expr::ScalarFunctionArgs;
+
+    use super::*;
+
+    /// Packs `arrays` into `ScalarFunctionArgs`: one nullable field per input array and a
+    /// nullable `LargeUtf8` return field. The row count is taken from the first array.
+    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
+        let number_rows = arrays[0].len();
+
+        let mut arg_fields = Vec::with_capacity(arrays.len());
+        for (position, array) in arrays.iter().enumerate() {
+            let field = Field::new(format!("arg_{}", position), array.data_type().clone(), true);
+            arg_fields.push(Arc::new(field));
+        }
+
+        ScalarFunctionArgs {
+            args: arrays.into_iter().map(ColumnarValue::Array).collect(),
+            arg_fields,
+            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
+            number_rows,
+            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
+        }
+    }
+
+    /// Invokes `FORMAT(x, d)` and unwraps the array result, panicking on a scalar.
+    fn run_format(x: ArrayRef, d: ArrayRef) -> ArrayRef {
+        let outcome = FormatFunction::default()
+            .invoke_with_args(create_args(vec![x, d]))
+            .unwrap();
+        match outcome {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array result"),
+        }
+    }
+
+    #[test]
+    fn test_format_basic() {
+        let x: ArrayRef = Arc::new(Float64Array::from(vec![1234567.891, 1234.5, 1234567.0]));
+        let d: ArrayRef = Arc::new(Int64Array::from(vec![2, 0, 3]));
+
+        let output = run_format(x, d);
+        let strings = output.as_string::<i64>();
+
+        assert_eq!(strings.value(0), "1,234,567.89");
+        assert_eq!(strings.value(1), "1,235"); // rounded
+        assert_eq!(strings.value(2), "1,234,567.000");
+    }
+
+    #[test]
+    fn test_format_negative() {
+        let x: ArrayRef = Arc::new(Float64Array::from(vec![-1234567.891]));
+        let d: ArrayRef = Arc::new(Int64Array::from(vec![2]));
+
+        let output = run_format(x, d);
+        let strings = output.as_string::<i64>();
+
+        assert_eq!(strings.value(0), "-1,234,567.89");
+    }
+
+    #[test]
+    fn test_format_small_numbers() {
+        let x: ArrayRef = Arc::new(Float64Array::from(vec![0.5, 12.345, 123.0]));
+        let d: ArrayRef = Arc::new(Int64Array::from(vec![2, 2, 0]));
+
+        let output = run_format(x, d);
+        let strings = output.as_string::<i64>();
+
+        assert_eq!(strings.value(0), "0.50");
+        assert_eq!(strings.value(1), "12.35"); // rounded
+        assert_eq!(strings.value(2), "123");
+    }
+
+    #[test]
+    fn test_format_with_nulls() {
+        let x: ArrayRef = Arc::new(Float64Array::from(vec![Some(1234.5), None]));
+        let d: ArrayRef = Arc::new(Int64Array::from(vec![2, 2]));
+
+        let output = run_format(x, d);
+        let strings = output.as_string::<i64>();
+
+        assert_eq!(strings.value(0), "1,234.50");
+        assert!(strings.is_null(1));
+    }
+
+    #[test]
+    fn test_add_thousand_separators() {
+        let cases = [
+            ("1", "1"),
+            ("12", "12"),
+            ("123", "123"),
+            ("1234", "1,234"),
+            ("12345", "12,345"),
+            ("123456", "123,456"),
+            ("1234567", "1,234,567"),
+            ("12345678", "12,345,678"),
+            ("123456789", "123,456,789"),
+        ];
+
+        for (digits, grouped) in cases {
+            assert_eq!(add_thousand_separators(digits), grouped);
+        }
+    }
+
+    #[test]
+    fn test_format_large_int_no_float_precision_loss() {
+        // 2^53 + 1 cannot be represented exactly as f64.
+        let x: ArrayRef = Arc::new(Int64Array::from(vec![9_007_199_254_740_993i64]));
+        let d: ArrayRef = Arc::new(Int64Array::from(vec![0]));
+
+        let output = run_format(x, d);
+        let strings = output.as_string::<i64>();
+
+        assert_eq!(strings.value(0), "9,007,199,254,740,993");
+    }
+
+    #[test]
+    fn test_format_decimal_places_u64_overflow_clamps() {
+        // u64::MAX does not fit in an i64; it must still clamp to 30 decimal places.
+        let x: ArrayRef = Arc::new(Int64Array::from(vec![1]));
+        let d: ArrayRef = Arc::new(UInt64Array::from(vec![u64::MAX]));
+
+        let output = run_format(x, d);
+        let strings = output.as_string::<i64>();
+
+        assert_eq!(strings.value(0), format!("1.{}", "0".repeat(30)));
+    }
+}
--- a/src/common/function/src/scalars/string/insert.rs
+++ b/src/common/function/src/scalars/string/insert.rs
@@ -0,0 +1,345 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! MySQL-compatible INSERT function implementation.
+//!
+//! INSERT(str, pos, len, newstr) - Inserts newstr into str at position pos,
+//! replacing len characters.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion_common::DataFusionError;
+use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, LargeStringBuilder};
+use datafusion_common::arrow::compute::cast;
+use datafusion_common::arrow::datatypes::DataType;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
+
+use crate::function::Function;
+use crate::function_registry::FunctionRegistry;
+
+/// Lower-case function name returned by `Function::name`; `Display` prints it upper-cased.
+const NAME: &str = "insert";
+
+/// MySQL-compatible INSERT function.
+///
+/// Syntax: INSERT(str, pos, len, newstr)
+/// Returns str with the substring beginning at position pos and len characters long
+/// replaced by newstr.
+///
+/// - pos is 1-based
+/// - If pos is out of range, returns the original string
+/// - If len is out of range, replaces from pos to end of string
+/// - The result is NULL if any of the four arguments is NULL
+/// - The result type is always `LargeUtf8`, whichever string type was passed in
+#[derive(Debug)]
+pub struct InsertFunction {
+    /// Accepted argument type combinations; populated by the `Default` impl.
+    signature: Signature,
+}
+
+impl InsertFunction {
+    /// Registers a default-constructed `InsertFunction` as a scalar function in `registry`.
+    pub fn register(registry: &FunctionRegistry) {
+        let function = Self::default();
+        registry.register_scalar(function);
+    }
+}
+
+impl Default for InsertFunction {
+    /// Builds the signature as one exact variant per `(str, pos, len, newstr)` type
+    /// combination: 3 string types x 8 integer types x 8 integer types x 3 string types.
+    fn default() -> Self {
+        let strings = [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View];
+        let ints = [
+            DataType::Int64,
+            DataType::Int32,
+            DataType::Int16,
+            DataType::Int8,
+            DataType::UInt64,
+            DataType::UInt32,
+            DataType::UInt16,
+            DataType::UInt8,
+        ];
+
+        let variant_count = strings.len() * strings.len() * ints.len() * ints.len();
+        let mut variants = Vec::with_capacity(variant_count);
+
+        // The nesting order (str, newstr, pos, len) fixes the order of the variants.
+        for source in strings.iter() {
+            for replacement in strings.iter() {
+                for position in ints.iter() {
+                    for length in ints.iter() {
+                        let argument_types = vec![
+                            source.clone(),
+                            position.clone(),
+                            length.clone(),
+                            replacement.clone(),
+                        ];
+                        variants.push(TypeSignature::Exact(argument_types));
+                    }
+                }
+            }
+        }
+
+        let signature = Signature::one_of(variants, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl fmt::Display for InsertFunction {
+    /// Writes the function name in upper case.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let upper = NAME.to_ascii_uppercase();
+        f.write_str(&upper)
+    }
+}
+
+impl Function for InsertFunction {
+    /// Name under which the function is exposed.
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    /// The result is always `LargeUtf8`, independent of the argument types.
+    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
+        Ok(DataType::LargeUtf8)
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Evaluates INSERT(str, pos, len, newstr) row by row.
+    ///
+    /// String arguments are cast to `LargeUtf8` and integer arguments to `Int64` so
+    /// that every accepted type combination goes through the same code path. A row
+    /// yields NULL as soon as any of its four inputs is NULL.
+    ///
+    /// # Errors
+    /// Returns an execution error when the argument count is not 4 or a cast fails.
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        if args.args.len() != 4 {
+            return Err(DataFusionError::Execution(
+                "INSERT requires exactly 4 arguments: INSERT(str, pos, len, newstr)".to_string(),
+            ));
+        }
+
+        let columns = ColumnarValue::values_to_arrays(&args.args)?;
+        let row_count = columns[0].len();
+
+        // Normalize the physical types (strings first, then integers).
+        let source_col = cast_to_large_utf8(&columns[0], "str")?;
+        let replacement_col = cast_to_large_utf8(&columns[3], "newstr")?;
+        let position_col = cast_to_int64(&columns[1], "pos")?;
+        let length_col = cast_to_int64(&columns[2], "len")?;
+
+        let sources = source_col.as_string::<i64>();
+        let replacements = replacement_col.as_string::<i64>();
+        let positions =
+            position_col.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
+        let lengths = length_col.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
+
+        let mut output = LargeStringBuilder::with_capacity(row_count, row_count * 32);
+
+        for row in 0..row_count {
+            let has_null = sources.is_null(row)
+                || positions.is_null(row)
+                || lengths.is_null(row)
+                || replacements.is_null(row);
+
+            if has_null {
+                output.append_null();
+            } else {
+                let replaced = insert_string(
+                    sources.value(row),
+                    positions.value(row),
+                    lengths.value(row),
+                    replacements.value(row),
+                );
+                output.append_value(&replaced);
+            }
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(output.finish())))
+    }
+}
+
+/// Converts `array` to `LargeUtf8` so all string inputs can be read the same way.
+///
+/// `name` is the SQL argument name and only appears in the error message.
+fn cast_to_large_utf8(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
+    match cast(array.as_ref(), &DataType::LargeUtf8) {
+        Ok(converted) => Ok(converted),
+        Err(e) => Err(DataFusionError::Execution(format!(
+            "INSERT: {} cast failed: {}",
+            name, e
+        ))),
+    }
+}
+
+/// Converts `array` to `Int64` so all integer inputs can be read the same way.
+///
+/// `name` is the SQL argument name and only appears in the error message.
+fn cast_to_int64(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
+    match cast(array.as_ref(), &DataType::Int64) {
+        Ok(converted) => Ok(converted),
+        Err(e) => Err(DataFusionError::Execution(format!(
+            "INSERT: {} cast failed: {}",
+            name, e
+        ))),
+    }
+}
+
+/// Perform the INSERT string operation.
+///
+/// Positions and lengths count Unicode scalar values (`char`s), not bytes.
+///
+/// * `pos` is 1-based. If `pos < 1` or `pos > len(str) + 1`, the original string is
+///   returned unchanged; `pos == len(str) + 1` appends `new_str`.
+/// * `replace_len` is the number of characters to replace. A value that is out of
+///   range (negative, or reaching past the end of the string) replaces everything
+///   from `pos` to the end of the string.
+fn insert_string(original: &str, pos: i64, replace_len: i64, new_str: &str) -> String {
+    let char_count = original.chars().count();
+
+    // `try_from` rejects negative positions and, on narrow targets, positions that do
+    // not fit in `usize`, instead of letting an `as` cast wrap them into range.
+    let start_idx = match usize::try_from(pos) {
+        Ok(p) if (1..=char_count + 1).contains(&p) => p - 1, // convert to 0-based
+        _ => return original.to_string(),
+    };
+
+    // `start_idx <= char_count` holds here, so the subtraction cannot underflow.
+    let remaining = char_count - start_idx;
+    let replaced = match usize::try_from(replace_len) {
+        Ok(n) if n <= remaining => n,
+        // Negative or too large: replace the rest of the string.
+        _ => remaining,
+    };
+
+    let start_byte = char_to_byte_idx(original, start_idx);
+    // Resume scanning at `start_byte` instead of walking the prefix a second time.
+    let end_byte = start_byte + char_to_byte_idx(&original[start_byte..], replaced);
+
+    let mut result = String::with_capacity(original.len() + new_str.len());
+    result.push_str(&original[..start_byte]);
+    result.push_str(new_str);
+    result.push_str(&original[end_byte..]);
+    result
+}
+
+/// Returns the byte offset at which the `char_idx`-th character (0-based) of `s`
+/// starts, or `s.len()` when `s` has `char_idx` or fewer characters.
+fn char_to_byte_idx(s: &str, char_idx: usize) -> usize {
+    s.char_indices()
+        .nth(char_idx)
+        .map_or(s.len(), |(idx, _)| idx)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion_common::arrow::array::{Int64Array, StringArray};
+    use datafusion_common::arrow::datatypes::Field;
+    use datafusion_expr::ScalarFunctionArgs;
+
+    use super::*;
+
+    /// Packs `arrays` into `ScalarFunctionArgs`, deriving one nullable argument
+    /// field per column from the column's data type.
+    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
+        let number_rows = arrays[0].len();
+        let mut arg_fields = Vec::with_capacity(arrays.len());
+        for (i, arr) in arrays.iter().enumerate() {
+            let field = Field::new(format!("arg_{}", i), arr.data_type().clone(), true);
+            arg_fields.push(Arc::new(field));
+        }
+
+        ScalarFunctionArgs {
+            args: arrays.into_iter().map(ColumnarValue::Array).collect(),
+            arg_fields,
+            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
+            number_rows,
+            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
+        }
+    }
+
+    /// Evaluates INSERT over the given columns and returns the result array.
+    fn run_insert(
+        strs: StringArray,
+        pos: Vec<i64>,
+        len: Vec<i64>,
+        newstr: Vec<&str>,
+    ) -> ArrayRef {
+        let columns: Vec<ArrayRef> = vec![
+            Arc::new(strs),
+            Arc::new(Int64Array::from(pos)),
+            Arc::new(Int64Array::from(len)),
+            Arc::new(StringArray::from(newstr)),
+        ];
+        let result = InsertFunction::default()
+            .invoke_with_args(create_args(columns))
+            .unwrap();
+
+        match result {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array result"),
+        }
+    }
+
+    #[test]
+    fn test_insert_basic() {
+        // INSERT('Quadratic', 3, 4, 'What') => 'QuWhattic'
+        let strs = StringArray::from(vec!["Quadratic"]);
+        let out = run_insert(strs, vec![3], vec![4], vec!["What"]);
+
+        assert_eq!(out.as_string::<i64>().value(0), "QuWhattic");
+    }
+
+    #[test]
+    fn test_insert_out_of_range_pos() {
+        // Row 0: pos = 0 (below 1); row 1: pos = 100 (past the end).
+        // Both must return 'Quadratic' untouched.
+        let strs = StringArray::from(vec!["Quadratic", "Quadratic"]);
+        let out = run_insert(strs, vec![0, 100], vec![4, 4], vec!["What", "What"]);
+
+        let values = out.as_string::<i64>();
+        assert_eq!(values.value(0), "Quadratic"); // pos < 1
+        assert_eq!(values.value(1), "Quadratic"); // pos > length
+    }
+
+    #[test]
+    fn test_insert_replace_to_end() {
+        // INSERT('Quadratic', 3, 100, 'What') => 'QuWhat' (len exceeds remaining)
+        let strs = StringArray::from(vec!["Quadratic"]);
+        let out = run_insert(strs, vec![3], vec![100], vec!["What"]);
+
+        assert_eq!(out.as_string::<i64>().value(0), "QuWhat");
+    }
+
+    #[test]
+    fn test_insert_unicode() {
+        // INSERT('hello世界', 6, 1, 'の') => 'helloの界'
+        let strs = StringArray::from(vec!["hello世界"]);
+        let out = run_insert(strs, vec![6], vec![1], vec!["の"]);
+
+        assert_eq!(out.as_string::<i64>().value(0), "helloの界");
+    }
+
+    #[test]
+    fn test_insert_with_nulls() {
+        // A NULL source string makes the whole row NULL.
+        let strs = StringArray::from(vec![Some("hello"), None]);
+        let out = run_insert(strs, vec![1, 1], vec![1, 1], vec!["X", "X"]);
+
+        let values = out.as_string::<i64>();
+        assert_eq!(values.value(0), "Xello");
+        assert!(values.is_null(1));
+    }
+}
--- a/src/common/function/src/scalars/string/locate.rs
+++ b/src/common/function/src/scalars/string/locate.rs
@@ -0,0 +1,373 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! MySQL-compatible LOCATE function implementation.
+//!
+//! LOCATE(substr, str) - Returns the position of the first occurrence of substr in str (1-based).
+//! LOCATE(substr, str, pos) - Returns the position of the first occurrence of substr in str,
+//!                            starting from position pos.
+//! Returns 0 if substr is not found.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion_common::DataFusionError;
+use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, Int64Builder};
+use datafusion_common::arrow::compute::cast;
+use datafusion_common::arrow::datatypes::DataType;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
+
+use crate::function::Function;
+use crate::function_registry::FunctionRegistry;
+
+/// Lower-case function name returned by `Function::name`; `Display` prints it upper-cased.
+const NAME: &str = "locate";
+
+/// MySQL-compatible LOCATE function.
+///
+/// Syntax:
+/// - LOCATE(substr, str) - Returns 1-based position of substr in str, or 0 if not found.
+/// - LOCATE(substr, str, pos) - Same, but starts searching from position pos.
+///
+/// Positions are counted in characters, not bytes. The result is an `Int64`, and it
+/// is NULL whenever any supplied argument is NULL.
+#[derive(Debug)]
+pub struct LocateFunction {
+    /// Accepted argument type combinations; populated by the `Default` impl.
+    signature: Signature,
+}
+
+impl LocateFunction {
+    /// Registers a default-constructed `LocateFunction` as a scalar function in `registry`.
+    pub fn register(registry: &FunctionRegistry) {
+        let function = Self::default();
+        registry.register_scalar(function);
+    }
+}
+
+impl Default for LocateFunction {
+    /// Builds the signature: every string-type pairing for the 2-argument form,
+    /// followed by every string-type pairing combined with each integer type for the
+    /// 3-argument form.
+    fn default() -> Self {
+        let strings = [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View];
+        let ints = [
+            DataType::Int64,
+            DataType::Int32,
+            DataType::Int16,
+            DataType::Int8,
+            DataType::UInt64,
+            DataType::UInt32,
+            DataType::UInt16,
+            DataType::UInt8,
+        ];
+
+        let pair_count = strings.len() * strings.len();
+        let mut variants = Vec::with_capacity(pair_count * (1 + ints.len()));
+
+        // LOCATE(substr, str)
+        for needle_type in strings.iter() {
+            for haystack_type in strings.iter() {
+                let argument_types = vec![needle_type.clone(), haystack_type.clone()];
+                variants.push(TypeSignature::Exact(argument_types));
+            }
+        }
+
+        // LOCATE(substr, str, pos)
+        for needle_type in strings.iter() {
+            for haystack_type in strings.iter() {
+                for position_type in ints.iter() {
+                    let argument_types = vec![
+                        needle_type.clone(),
+                        haystack_type.clone(),
+                        position_type.clone(),
+                    ];
+                    variants.push(TypeSignature::Exact(argument_types));
+                }
+            }
+        }
+
+        let signature = Signature::one_of(variants, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl fmt::Display for LocateFunction {
+    /// Writes the function name in upper case.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let upper = NAME.to_ascii_uppercase();
+        f.write_str(&upper)
+    }
+}
+
+impl Function for LocateFunction {
+    /// Name under which the function is exposed.
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    /// The result is always `Int64`.
+    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
+        Ok(DataType::Int64)
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Evaluates LOCATE(substr, str[, pos]) row by row.
+    ///
+    /// String arguments are cast to `LargeUtf8` and `pos`, when given, to `Int64`.
+    /// A row yields NULL if any supplied argument is NULL, and 0 if `pos < 1`.
+    ///
+    /// # Errors
+    /// Returns an execution error when the argument count is not 2 or 3, or a cast fails.
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        let arg_count = args.args.len();
+        if arg_count != 2 && arg_count != 3 {
+            return Err(DataFusionError::Execution(
+                "LOCATE requires 2 or 3 arguments: LOCATE(substr, str) or LOCATE(substr, str, pos)"
+                    .to_string(),
+            ));
+        }
+
+        let columns = ColumnarValue::values_to_arrays(&args.args)?;
+
+        // Normalize the string columns to a single physical type.
+        let needle_col = cast_to_large_utf8(&columns[0], "substr")?;
+        let haystack_col = cast_to_large_utf8(&columns[1], "str")?;
+        let needles = needle_col.as_string::<i64>();
+        let haystacks = haystack_col.as_string::<i64>();
+        let row_count = needles.len();
+
+        // The third column exists only in the 3-argument form.
+        let position_col: Option<ArrayRef> = match columns.get(2) {
+            Some(column) => Some(cast_to_int64(column, "pos")?),
+            None => None,
+        };
+        let positions = position_col
+            .as_ref()
+            .map(|column| column.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>());
+
+        let mut output = Int64Builder::with_capacity(row_count);
+
+        for row in 0..row_count {
+            if needles.is_null(row) || haystacks.is_null(row) {
+                output.append_null();
+                continue;
+            }
+
+            // 0-based character offset at which the search starts.
+            let start_pos = match positions {
+                None => 0,
+                Some(column) if column.is_null(row) => {
+                    output.append_null();
+                    continue;
+                }
+                Some(column) => {
+                    let pos = column.value(row);
+                    if pos < 1 {
+                        // MySQL returns 0 for pos < 1
+                        output.append_value(0);
+                        continue;
+                    }
+                    (pos - 1) as usize
+                }
+            };
+
+            let found = locate_substr(haystacks.value(row), needles.value(row), start_pos);
+            output.append_value(found);
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(output.finish())))
+    }
+}
+
+/// Converts `array` to `LargeUtf8` so all string inputs can be read the same way.
+///
+/// `name` is the SQL argument name and only appears in the error message.
+fn cast_to_large_utf8(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
+    match cast(array.as_ref(), &DataType::LargeUtf8) {
+        Ok(converted) => Ok(converted),
+        Err(e) => Err(DataFusionError::Execution(format!(
+            "LOCATE: {} cast failed: {}",
+            name, e
+        ))),
+    }
+}
+
+/// Converts `array` to `Int64` so all integer inputs can be read the same way.
+///
+/// `name` is the SQL argument name and only appears in the error message.
+fn cast_to_int64(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
+    match cast(array.as_ref(), &DataType::Int64) {
+        Ok(converted) => Ok(converted),
+        Err(e) => Err(DataFusionError::Execution(format!(
+            "LOCATE: {} cast failed: {}",
+            name, e
+        ))),
+    }
+}
+
+/// Returns the 1-based character position of the first occurrence of `needle` in
+/// `haystack` at or after `start_pos` (a 0-based character index), or 0 if there is none.
+///
+/// An empty `needle` matches immediately at the start position, provided that
+/// position is no further than one past the last character.
+fn locate_substr(haystack: &str, needle: &str, start_pos: usize) -> i64 {
+    if needle.is_empty() {
+        let total_chars = haystack.chars().count();
+        if start_pos > total_chars {
+            return 0;
+        }
+        return (start_pos + 1) as i64;
+    }
+
+    // Translate the character offset into a byte offset. Running out of characters
+    // means the search would start at or beyond the end, where nothing can match.
+    let byte_start = match haystack.char_indices().nth(start_pos) {
+        Some((offset, _)) => offset,
+        None => return 0,
+    };
+
+    let tail = &haystack[byte_start..];
+    match tail.find(needle) {
+        // `hit` is a byte offset into `tail`; count the characters in front of it
+        // and shift by `start_pos` to get a position in the full string.
+        Some(hit) => (start_pos + tail[..hit].chars().count() + 1) as i64,
+        None => 0,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion_common::arrow::array::StringArray;
+    use datafusion_common::arrow::datatypes::Field;
+    use datafusion_expr::ScalarFunctionArgs;
+
+    use super::*;
+
+    /// Packs `arrays` into `ScalarFunctionArgs`, deriving one nullable argument
+    /// field per column from the column's data type.
+    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
+        let number_rows = arrays[0].len();
+        let mut arg_fields = Vec::with_capacity(arrays.len());
+        for (i, arr) in arrays.iter().enumerate() {
+            let field = Field::new(format!("arg_{}", i), arr.data_type().clone(), true);
+            arg_fields.push(Arc::new(field));
+        }
+
+        ScalarFunctionArgs {
+            args: arrays.into_iter().map(ColumnarValue::Array).collect(),
+            arg_fields,
+            return_field: Arc::new(Field::new("result", DataType::Int64, true)),
+            number_rows,
+            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
+        }
+    }
+
+    /// Evaluates LOCATE over the given columns (`pos` is optional) and collects the
+    /// `Int64` results, with `None` standing for NULL.
+    fn run_locate(
+        substr: StringArray,
+        haystack: Vec<&str>,
+        pos: Option<Vec<i64>>,
+    ) -> Vec<Option<i64>> {
+        let mut columns: Vec<ArrayRef> =
+            vec![Arc::new(substr), Arc::new(StringArray::from(haystack))];
+        if let Some(pos) = pos {
+            columns.push(Arc::new(
+                datafusion_common::arrow::array::Int64Array::from(pos),
+            ));
+        }
+
+        let result = LocateFunction::default()
+            .invoke_with_args(create_args(columns))
+            .unwrap();
+
+        match result {
+            ColumnarValue::Array(array) => array
+                .as_primitive::<datafusion_common::arrow::datatypes::Int64Type>()
+                .iter()
+                .collect(),
+            _ => panic!("Expected array result"),
+        }
+    }
+
+    #[test]
+    fn test_locate_basic() {
+        let needles = StringArray::from(vec!["world", "xyz", "hello"]);
+        let found = run_locate(needles, vec!["hello world"; 3], None);
+
+        // "world" at position 7, "xyz" not found, "hello" at position 1
+        assert_eq!(found, vec![Some(7), Some(0), Some(1)]);
+    }
+
+    #[test]
+    fn test_locate_with_position() {
+        let needles = StringArray::from(vec!["o", "o", "o"]);
+        let found = run_locate(needles, vec!["hello world"; 3], Some(vec![1, 5, 8]));
+
+        // From 1: first 'o' at 5; from 5: still the 'o' at 5; from 8: the 'o' in "world".
+        assert_eq!(found, vec![Some(5), Some(5), Some(8)]);
+    }
+
+    #[test]
+    fn test_locate_unicode() {
+        let needles = StringArray::from(vec!["世", "界"]);
+        let found = run_locate(needles, vec!["hello世界", "hello世界"], None);
+
+        // Positions are counted in characters: "世" is 6th, "界" is 7th.
+        assert_eq!(found, vec![Some(6), Some(7)]);
+    }
+
+    #[test]
+    fn test_locate_empty_needle() {
+        let needles = StringArray::from(vec!["", ""]);
+        let found = run_locate(needles, vec!["hello", "hello"], Some(vec![1, 3]));
+
+        // An empty needle is found right at the requested start position.
+        assert_eq!(found, vec![Some(1), Some(3)]);
+    }
+
+    #[test]
+    fn test_locate_with_nulls() {
+        let needles = StringArray::from(vec![Some("o"), None]);
+        let found = run_locate(needles, vec!["hello", "hello"], None);
+
+        // A NULL needle makes the row NULL.
+        assert_eq!(found, vec![Some(5), None]);
+    }
+}
--- a/src/common/function/src/scalars/string/space.rs
+++ b/src/common/function/src/scalars/string/space.rs
@@ -0,0 +1,252 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! MySQL-compatible SPACE function implementation.
+//!
+//! SPACE(N) - Returns a string consisting of N space characters.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion_common::DataFusionError;
+use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
+use datafusion_common::arrow::datatypes::DataType;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
+
+use crate::function::Function;
+use crate::function_registry::FunctionRegistry;
+
+/// Lower-case function name returned by `Function::name`; `Display` prints it upper-cased.
+const NAME: &str = "space";
+
+// Safety limit for maximum number of spaces: requests above it are rejected with an
+// execution error rather than allocating.
+const MAX_SPACE_COUNT: i64 = 1024 * 1024; // 1,048,576 spaces (1 MiB at one byte each)
+
+/// MySQL-compatible SPACE function.
+///
+/// Syntax: SPACE(N)
+/// Returns a string consisting of N space characters.
+/// Returns NULL if N is NULL.
+/// Returns empty string if N < 0.
+/// Fails with an execution error if N exceeds `MAX_SPACE_COUNT`.
+/// The result type is always `LargeUtf8`.
+#[derive(Debug)]
+pub struct SpaceFunction {
+    /// Accepted argument types; populated by the `Default` impl.
+    signature: Signature,
+}
+
+impl SpaceFunction {
+    /// Registers a default-constructed `SpaceFunction` as a scalar function in `registry`.
+    pub fn register(registry: &FunctionRegistry) {
+        let function = Self::default();
+        registry.register_scalar(function);
+    }
+}
+
+impl Default for SpaceFunction {
+    /// Builds the signature: a single argument of any signed or unsigned integer width.
+    fn default() -> Self {
+        let accepted = [
+            DataType::Int64,
+            DataType::Int32,
+            DataType::Int16,
+            DataType::Int8,
+            DataType::UInt64,
+            DataType::UInt32,
+            DataType::UInt16,
+            DataType::UInt8,
+        ];
+
+        // One exact single-argument variant per integer type, in the order listed above.
+        let variants = accepted
+            .iter()
+            .map(|int_type| TypeSignature::Exact(vec![int_type.clone()]))
+            .collect();
+
+        let signature = Signature::one_of(variants, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl fmt::Display for SpaceFunction {
+    /// Writes the function name in upper case.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let upper = NAME.to_ascii_uppercase();
+        f.write_str(&upper)
+    }
+}
+
+impl Function for SpaceFunction {
+    /// Name under which the function is exposed.
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    /// The result is always `LargeUtf8`.
+    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
+        Ok(DataType::LargeUtf8)
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Evaluates SPACE(N) row by row.
+    ///
+    /// NULL input gives NULL, a negative count gives an empty string, and any other
+    /// count gives that many spaces.
+    ///
+    /// # Errors
+    /// Returns an execution error when the argument count is not 1, when the value
+    /// cannot be read as an `i64`, or when a count exceeds `MAX_SPACE_COUNT`.
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        if args.args.len() != 1 {
+            return Err(DataFusionError::Execution(
+                "SPACE requires exactly 1 argument: SPACE(N)".to_string(),
+            ));
+        }
+
+        let columns = ColumnarValue::values_to_arrays(&args.args)?;
+        let counts = &columns[0];
+        let row_count = counts.len();
+
+        let mut output = LargeStringBuilder::with_capacity(row_count, row_count * 10);
+
+        for row in 0..row_count {
+            if counts.is_null(row) {
+                output.append_null();
+                continue;
+            }
+
+            let spaces = match get_int_value(counts, row)? {
+                n if n > MAX_SPACE_COUNT => {
+                    return Err(DataFusionError::Execution(format!(
+                        "SPACE: requested {} spaces exceeds maximum allowed ({})",
+                        n, MAX_SPACE_COUNT
+                    )));
+                }
+                // MySQL returns empty string for negative values
+                n if n < 0 => String::new(),
+                n => " ".repeat(n as usize),
+            };
+            output.append_value(&spaces);
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(output.finish())))
+    }
+}
+
+/// Reads the element at `index` of an integer array as an `i64`.
+///
+/// The caller is expected to have ruled out NULL at `index` beforehand.
+///
+/// # Errors
+/// Returns an execution error for a `UInt64` value above `i64::MAX` and for any
+/// non-integer array type.
+fn get_int_value(
+    array: &datafusion_common::arrow::array::ArrayRef,
+    index: usize,
+) -> datafusion_common::Result<i64> {
+    use datafusion_common::arrow::datatypes as arrow_types;
+
+    let value = match array.data_type() {
+        DataType::Int64 => array.as_primitive::<arrow_types::Int64Type>().value(index),
+        DataType::Int32 => i64::from(array.as_primitive::<arrow_types::Int32Type>().value(index)),
+        DataType::Int16 => i64::from(array.as_primitive::<arrow_types::Int16Type>().value(index)),
+        DataType::Int8 => i64::from(array.as_primitive::<arrow_types::Int8Type>().value(index)),
+        DataType::UInt64 => {
+            // The only width that may not fit into i64.
+            let raw = array.as_primitive::<arrow_types::UInt64Type>().value(index);
+            if raw > i64::MAX as u64 {
+                return Err(DataFusionError::Execution(format!(
+                    "SPACE: value {} exceeds maximum",
+                    raw
+                )));
+            }
+            raw as i64
+        }
+        DataType::UInt32 => i64::from(array.as_primitive::<arrow_types::UInt32Type>().value(index)),
+        DataType::UInt16 => i64::from(array.as_primitive::<arrow_types::UInt16Type>().value(index)),
+        DataType::UInt8 => i64::from(array.as_primitive::<arrow_types::UInt8Type>().value(index)),
+        other => {
+            return Err(DataFusionError::Execution(format!(
+                "SPACE: unsupported type {:?}",
+                other
+            )));
+        }
+    };
+
+    Ok(value)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion_common::arrow::array::Int64Array;
+    use datafusion_common::arrow::datatypes::Field;
+    use datafusion_expr::ScalarFunctionArgs;
+
+    use super::*;
+
+    /// Packs `arrays` into `ScalarFunctionArgs`, deriving one nullable argument
+    /// field per column from the column's data type.
+    fn create_args(arrays: Vec<datafusion_common::arrow::array::ArrayRef>) -> ScalarFunctionArgs {
+        let number_rows = arrays[0].len();
+        let mut arg_fields = Vec::with_capacity(arrays.len());
+        for (i, arr) in arrays.iter().enumerate() {
+            let field = Field::new(format!("arg_{}", i), arr.data_type().clone(), true);
+            arg_fields.push(Arc::new(field));
+        }
+
+        ScalarFunctionArgs {
+            args: arrays.into_iter().map(ColumnarValue::Array).collect(),
+            arg_fields,
+            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
+            number_rows,
+            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
+        }
+    }
+
+    /// Evaluates SPACE over the column `n` and returns the result array.
+    fn run_space(n: Int64Array) -> datafusion_common::arrow::array::ArrayRef {
+        let column: datafusion_common::arrow::array::ArrayRef = Arc::new(n);
+        let result = SpaceFunction::default()
+            .invoke_with_args(create_args(vec![column]))
+            .unwrap();
+
+        match result {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array result"),
+        }
+    }
+
+    #[test]
+    fn test_space_basic() {
+        let out = run_space(Int64Array::from(vec![0, 1, 5]));
+        let values = out.as_string::<i64>();
+
+        assert_eq!(values.value(0), "");
+        assert_eq!(values.value(1), " ");
+        assert_eq!(values.value(2), "     ");
+    }
+
+    #[test]
+    fn test_space_negative() {
+        // Negative counts produce an empty string, not an error.
+        let out = run_space(Int64Array::from(vec![-1, -100]));
+        let values = out.as_string::<i64>();
+
+        assert_eq!(values.value(0), "");
+        assert_eq!(values.value(1), "");
+    }
+
+    #[test]
+    fn test_space_with_nulls() {
+        // A NULL count yields a NULL result.
+        let out = run_space(Int64Array::from(vec![Some(3), None]));
+        let values = out.as_string::<i64>();
+
+        assert_eq!(values.value(0), "   ");
+        assert!(values.is_null(1));
+    }
+}
--- a/src/common/function/src/scalars/vector/convert/vector_to_string.rs
+++ b/src/common/function/src/scalars/vector/convert/vector_to_string.rs
@@ -19,8 +19,10 @@ use datafusion_common::DataFusionError;
 use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder};
 use datafusion_common::arrow::compute;
 use datafusion_common::arrow::datatypes::DataType;
-use datafusion_expr::type_coercion::aggregates::BINARYS;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
+use datafusion_common::types::logical_binary;
+use datafusion_expr::{
+    Coercion, ColumnarValue, ScalarFunctionArgs, Signature, TypeSignatureClass, Volatility,
+};
 use datatypes::types::vector_type_value_to_string;

 use crate::function::{Function, extract_args};
@@ -35,11 +37,10 @@ pub struct VectorToStringFunction {
 impl Default for VectorToStringFunction {
    fn default() -> Self {
        Self {
-            signature: Signature::one_of(
-                vec![
-                    TypeSignature::Uniform(1, vec![DataType::BinaryView]),
-                    TypeSignature::Uniform(1, BINARYS.to_vec()),
-                ],
+            signature: Signature::coercible(
+                vec![Coercion::new_exact(TypeSignatureClass::Native(
+                    logical_binary(),
+                ))],
                Volatility::Immutable,
            ),
        }
--- a/src/common/function/src/scalars/vector/elem_avg.rs
+++ b/src/common/function/src/scalars/vector/elem_avg.rs
@@ -15,10 +15,10 @@
 use std::fmt::Display;

 use datafusion::arrow::datatypes::DataType;
-use datafusion::logical_expr::ColumnarValue;
+use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignature, TypeSignatureClass};
 use datafusion_common::ScalarValue;
-use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS};
-use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
+use datafusion_common::types::{logical_binary, logical_string};
+use datafusion_expr::{ScalarFunctionArgs, Signature, Volatility};
 use nalgebra::DVectorView;

 use crate::function::Function;
@@ -36,9 +36,12 @@ impl Default for ElemAvgFunction {
        Self {
            signature: Signature::one_of(
                vec![
-                    TypeSignature::Uniform(1, STRINGS.to_vec()),
-                    TypeSignature::Uniform(1, BINARYS.to_vec()),
-                    TypeSignature::Uniform(1, vec![DataType::BinaryView]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_binary()),
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_string()),
+                    )]),
                ],
                Volatility::Immutable,
            ),
--- a/src/common/function/src/scalars/vector/elem_product.rs
+++ b/src/common/function/src/scalars/vector/elem_product.rs
@@ -15,10 +15,10 @@
 use std::fmt::Display;

 use datafusion::arrow::datatypes::DataType;
-use datafusion::logical_expr::ColumnarValue;
-use datafusion::logical_expr_common::type_coercion::aggregates::{BINARYS, STRINGS};
+use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignature, TypeSignatureClass};
 use datafusion_common::ScalarValue;
-use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
+use datafusion_common::types::{logical_binary, logical_string};
+use datafusion_expr::{ScalarFunctionArgs, Signature, Volatility};
 use nalgebra::DVectorView;

 use crate::function::Function;
@@ -49,9 +49,12 @@ impl Default for ElemProductFunction {
        Self {
            signature: Signature::one_of(
                vec![
-                    TypeSignature::Uniform(1, STRINGS.to_vec()),
-                    TypeSignature::Uniform(1, BINARYS.to_vec()),
-                    TypeSignature::Uniform(1, vec![DataType::BinaryView]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_binary()),
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_string()),
+                    )]),
                ],
                Volatility::Immutable,
            ),
--- a/src/common/function/src/scalars/vector/elem_sum.rs
+++ b/src/common/function/src/scalars/vector/elem_sum.rs
@@ -15,9 +15,9 @@
 use std::fmt::Display;

 use datafusion::arrow::datatypes::DataType;
-use datafusion::logical_expr::ColumnarValue;
+use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignatureClass};
 use datafusion_common::ScalarValue;
-use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS};
+use datafusion_common::types::{logical_binary, logical_string};
 use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
 use nalgebra::DVectorView;

@@ -36,9 +36,12 @@ impl Default for ElemSumFunction {
        Self {
            signature: Signature::one_of(
                vec![
-                    TypeSignature::Uniform(1, STRINGS.to_vec()),
-                    TypeSignature::Uniform(1, BINARYS.to_vec()),
-                    TypeSignature::Uniform(1, vec![DataType::BinaryView]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_binary()),
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_string()),
+                    )]),
                ],
                Volatility::Immutable,
            ),
--- a/src/common/function/src/scalars/vector/vector_dim.rs
+++ b/src/common/function/src/scalars/vector/vector_dim.rs
@@ -15,9 +15,9 @@
 use std::fmt::Display;

 use datafusion::arrow::datatypes::DataType;
-use datafusion::logical_expr::ColumnarValue;
-use datafusion::logical_expr_common::type_coercion::aggregates::{BINARYS, STRINGS};
+use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignatureClass};
 use datafusion_common::ScalarValue;
+use datafusion_common::types::{logical_binary, logical_string};
 use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};

 use crate::function::Function;
@@ -49,8 +49,12 @@ impl Default for VectorDimFunction {
        Self {
            signature: Signature::one_of(
                vec![
-                    TypeSignature::Uniform(1, STRINGS.to_vec()),
-                    TypeSignature::Uniform(1, BINARYS.to_vec()),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_binary()),
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_string()),
+                    )]),
                ],
                Volatility::Immutable,
            ),
--- a/src/common/function/src/scalars/vector/vector_norm.rs
+++ b/src/common/function/src/scalars/vector/vector_norm.rs
@@ -15,9 +15,9 @@
 use std::fmt::Display;

 use datafusion::arrow::datatypes::DataType;
-use datafusion::logical_expr::ColumnarValue;
-use datafusion::logical_expr_common::type_coercion::aggregates::{BINARYS, STRINGS};
+use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignatureClass};
 use datafusion_common::ScalarValue;
+use datafusion_common::types::{logical_binary, logical_string};
 use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
 use nalgebra::DVectorView;

@@ -52,9 +52,12 @@ impl Default for VectorNormFunction {
        Self {
            signature: Signature::one_of(
                vec![
-                    TypeSignature::Uniform(1, STRINGS.to_vec()),
-                    TypeSignature::Uniform(1, BINARYS.to_vec()),
-                    TypeSignature::Uniform(1, vec![DataType::BinaryView]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_binary()),
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_string()),
+                    )]),
                ],
                Volatility::Immutable,
            ),
--- a/src/common/function/src/system/procedure_state.rs
+++ b/src/common/function/src/system/procedure_state.rs
@@ -106,7 +106,8 @@ mod tests {
        assert!(matches!(f.signature(),
                         datafusion_expr::Signature {
                             type_signature: datafusion_expr::TypeSignature::Uniform(1, valid_types),
-                             volatility: datafusion_expr::Volatility::Immutable
+                             volatility: datafusion_expr::Volatility::Immutable,
+                             ..
                         } if valid_types == &vec![ArrowDataType::Utf8]));
    }

--- a/src/common/grpc-expr/src/alter.rs
+++ b/src/common/grpc-expr/src/alter.rs
@@ -34,7 +34,7 @@ use table::requests::{
 };

 use crate::error::{
-    ColumnNotFoundSnafu, InvalidColumnDefSnafu, InvalidIndexOptionSnafu,
+    self, ColumnNotFoundSnafu, InvalidColumnDefSnafu, InvalidIndexOptionSnafu,
    InvalidSetFulltextOptionRequestSnafu, InvalidSetSkippingIndexOptionRequestSnafu,
    InvalidSetTableOptionRequestSnafu, InvalidUnsetTableOptionRequestSnafu,
    MissingAlterIndexOptionSnafu, MissingFieldSnafu, MissingTableMetaSnafu,
@@ -251,6 +251,10 @@ pub fn alter_expr_to_request(
                .collect::<Result<Vec<_>>>()?;
            AlterKind::SetDefaults { defaults }
        }
+        Kind::Repartition(_) => error::UnexpectedSnafu {
+            err_msg: "Repartition operation should be handled through DdlManager and not converted to AlterTableRequest",
+        }
+        .fail()?,
    };

    let request = AlterTableRequest {
--- a/src/common/grpc-expr/src/error.rs
+++ b/src/common/grpc-expr/src/error.rs
@@ -161,6 +161,13 @@ pub enum Error {
        #[snafu(implicit)]
        location: Location,
    },
+
+    #[snafu(display("Unexpected: {err_msg}"))]
+    Unexpected {
+        err_msg: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
 }

 pub type Result<T> = std::result::Result<T, Error>;
@@ -188,6 +195,7 @@ impl ErrorExt for Error {
            Error::ColumnNotFound { .. } => StatusCode::TableColumnNotFound,
            Error::SqlCommon { source, .. } => source.status_code(),
            Error::MissingTableMeta { .. } => StatusCode::Unexpected,
+            Error::Unexpected { .. } => StatusCode::Unexpected,
        }
    }

--- a/src/common/grpc/src/flight.rs
+++ b/src/common/grpc/src/flight.rs
@@ -103,10 +103,11 @@ impl FlightEncoder {
            FlightMessage::RecordBatch(record_batch) => {
                let (encoded_dictionaries, encoded_batch) = self
                    .data_gen
-                    .encoded_batch(
+                    .encode(
                        &record_batch,
                        &mut self.dictionary_tracker,
                        &self.write_options,
+                        &mut Default::default(),
                    )
                    .expect("DictionaryTracker configured above to not fail on replacement");

--- a/src/common/memory-manager/src/guard.rs
+++ b/src/common/memory-manager/src/guard.rs
@@ -15,9 +15,14 @@
 use std::{fmt, mem};

 use common_telemetry::debug;
+use snafu::ensure;
 use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};

+use crate::error::{
+    MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
+};
 use crate::manager::{MemoryMetrics, MemoryQuota};
+use crate::policy::OnExhaustedPolicy;

 /// Guard representing a slice of reserved memory.
 pub struct MemoryGuard<M: MemoryMetrics> {
@@ -55,11 +60,52 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
        }
    }

-    /// Tries to allocate additional memory during task execution.
+    /// Acquires additional memory, waiting if necessary until enough is available.
+    ///
+    /// On success, merges the new memory into this guard.
+    ///
+    /// # Errors
+    /// - Returns error if the permits already held plus the requested bytes would exceed the manager's total limit
+    /// - Returns error if the semaphore is unexpectedly closed
+    pub async fn acquire_additional(&mut self, bytes: u64) -> Result<()> {
+        match &mut self.state {
+            GuardState::Unlimited => Ok(()),
+            GuardState::Limited { permit, quota } => {
+                if bytes == 0 {
+                    return Ok(());
+                }
+
+                let additional_permits = quota.bytes_to_permits(bytes);
+                let current_permits = permit.num_permits() as u32;
+
+                ensure!(
+                    current_permits.saturating_add(additional_permits) <= quota.limit_permits,
+                    MemoryLimitExceededSnafu {
+                        requested_bytes: bytes,
+                        limit_bytes: quota.permits_to_bytes(quota.limit_permits)
+                    }
+                );
+
+                let additional_permit = quota
+                    .semaphore
+                    .clone()
+                    .acquire_many_owned(additional_permits)
+                    .await
+                    .map_err(|_| MemorySemaphoreClosedSnafu.build())?;
+
+                permit.merge(additional_permit);
+                quota.update_in_use_metric();
+                debug!("Acquired additional {} bytes", bytes);
+                Ok(())
+            }
+        }
+    }
+
+    /// Tries to acquire additional memory without waiting.
    ///
    /// On success, merges the new memory into this guard and returns true.
    /// On failure, returns false and leaves this guard unchanged.
-    pub fn request_additional(&mut self, bytes: u64) -> bool {
+    pub fn try_acquire_additional(&mut self, bytes: u64) -> bool {
        match &mut self.state {
            GuardState::Unlimited => true,
            GuardState::Limited { permit, quota } => {
@@ -77,11 +123,11 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
                    Ok(additional_permit) => {
                        permit.merge(additional_permit);
                        quota.update_in_use_metric();
-                        debug!("Allocated additional {} bytes", bytes);
+                        debug!("Acquired additional {} bytes", bytes);
                        true
                    }
                    Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
-                        quota.metrics.inc_rejected("request_additional");
+                        quota.metrics.inc_rejected("try_acquire_additional");
                        false
                    }
                }
@@ -89,11 +135,55 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
        }
    }

-    /// Releases a portion of granted memory back to the pool early,
-    /// before the guard is dropped.
+    /// Acquires additional memory based on the given policy.
+    ///
+    /// - For `OnExhaustedPolicy::Wait`: Waits up to the timeout duration for memory to become available
+    /// - For `OnExhaustedPolicy::Fail`: Returns immediately if memory is not available
+    ///
+    /// # Errors
+    /// - `MemoryLimitExceeded`: Requested bytes would exceed the total limit (both policies), or memory is currently exhausted (Fail policy only)
+    /// - `MemoryAcquireTimeout`: Timeout elapsed while waiting for memory (Wait policy only)
+    /// - `MemorySemaphoreClosed`: The internal semaphore is unexpectedly closed (rare, indicates system issue)
+    pub async fn acquire_additional_with_policy(
+        &mut self,
+        bytes: u64,
+        policy: OnExhaustedPolicy,
+    ) -> Result<()> {
+        match policy {
+            OnExhaustedPolicy::Wait { timeout } => {
+                match tokio::time::timeout(timeout, self.acquire_additional(bytes)).await {
+                    Ok(Ok(())) => Ok(()),
+                    Ok(Err(e)) => Err(e),
+                    Err(_elapsed) => MemoryAcquireTimeoutSnafu {
+                        requested_bytes: bytes,
+                        waited: timeout,
+                    }
+                    .fail(),
+                }
+            }
+            OnExhaustedPolicy::Fail => {
+                if self.try_acquire_additional(bytes) {
+                    Ok(())
+                } else {
+                    MemoryLimitExceededSnafu {
+                        requested_bytes: bytes,
+                        limit_bytes: match &self.state {
+                            GuardState::Unlimited => 0, // unreachable: unlimited mode always succeeds
+                            GuardState::Limited { quota, .. } => {
+                                quota.permits_to_bytes(quota.limit_permits)
+                            }
+                        },
+                    }
+                    .fail()
+                }
+            }
+        }
+    }
+
+    /// Releases a portion of granted memory back to the pool before the guard is dropped.
    ///
    /// Returns true if the release succeeds or is a no-op; false if the request exceeds granted.
-    pub fn early_release_partial(&mut self, bytes: u64) -> bool {
+    pub fn release_partial(&mut self, bytes: u64) -> bool {
        match &mut self.state {
            GuardState::Unlimited => true,
            GuardState::Limited { permit, quota } => {
@@ -109,7 +199,7 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
                            quota.permits_to_bytes(released_permit.num_permits() as u32);
                        drop(released_permit);
                        quota.update_in_use_metric();
-                        debug!("Early released {} bytes from memory guard", released_bytes);
+                        debug!("Released {} bytes from memory guard", released_bytes);
                        true
                    }
                    None => false,
--- a/src/common/memory-manager/src/manager.rs
+++ b/src/common/memory-manager/src/manager.rs
@@ -37,6 +37,12 @@ pub struct MemoryManager<M: MemoryMetrics> {
    quota: Option<MemoryQuota<M>>,
 }

+impl<M: MemoryMetrics + Default> Default for MemoryManager<M> {
+    fn default() -> Self {
+        Self::new(0, M::default())
+    }
+}
+
 #[derive(Clone)]
 pub(crate) struct MemoryQuota<M: MemoryMetrics> {
    pub(crate) semaphore: Arc<Semaphore>,
--- a/src/common/memory-manager/src/tests.rs
+++ b/src/common/memory-manager/src/tests.rs
@@ -83,7 +83,7 @@ fn test_request_additional_success() {
    assert_eq!(manager.used_bytes(), base);

    // Request additional memory (3MB) - should succeed and merge
-    assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
 }
@@ -98,11 +98,11 @@ fn test_request_additional_exceeds_limit() {
    let mut guard = manager.try_acquire(base).unwrap();

    // Request additional memory (3MB) - should succeed
-    assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);

    // Request more (3MB) - should fail (would exceed 10MB limit)
-    let result = guard.request_additional(3 * PERMIT_GRANULARITY_BYTES);
+    let result = guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES);
    assert!(!result);

    // Still at 8MB
@@ -119,7 +119,7 @@ fn test_request_additional_auto_release_on_guard_drop() {
        let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();

        // Request additional - memory is merged into guard
-        assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
+        assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES));
        assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);

        // When guard drops, all memory (base + additional) is released together
@@ -135,7 +135,7 @@ fn test_request_additional_unlimited() {
    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();

    // Should always succeed with unlimited manager
-    assert!(guard.request_additional(100 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.try_acquire_additional(100 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 0);
    assert_eq!(manager.used_bytes(), 0);
 }
@@ -148,7 +148,7 @@ fn test_request_additional_zero_bytes() {
    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();

    // Request 0 bytes should succeed without affecting anything
-    assert!(guard.request_additional(0));
+    assert!(guard.try_acquire_additional(0));
    assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
 }
@@ -162,7 +162,7 @@ fn test_early_release_partial_success() {
    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);

    // Release half
-    assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.release_partial(4 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 4 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 4 * PERMIT_GRANULARITY_BYTES);

@@ -177,7 +177,7 @@ fn test_early_release_partial_exceeds_granted() {
    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();

    // Try to release more than granted - should fail
-    assert!(!guard.early_release_partial(10 * PERMIT_GRANULARITY_BYTES));
+    assert!(!guard.release_partial(10 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
 }
@@ -188,7 +188,7 @@ fn test_early_release_partial_unlimited() {
    let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap();

    // Unlimited guard - release should succeed (no-op)
-    assert!(guard.early_release_partial(50 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.release_partial(50 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 0);
 }

@@ -200,22 +200,22 @@ fn test_request_and_early_release_symmetry() {
    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();

    // Request additional
-    assert!(guard.request_additional(5 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.try_acquire_additional(5 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES);

    // Early release some
-    assert!(guard.early_release_partial(3 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.release_partial(3 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES);

    // Request again
-    assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.try_acquire_additional(2 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);

    // Early release again
-    assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
+    assert!(guard.release_partial(4 * PERMIT_GRANULARITY_BYTES));
    assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
    assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);

@@ -226,25 +226,186 @@ fn test_request_and_early_release_symmetry() {
 #[test]
 fn test_small_allocation_rounds_up() {
    // Test that allocations smaller than PERMIT_GRANULARITY_BYTES
-    // round up to 1 permit and can use request_additional()
+    // round up to 1 permit and can use try_acquire_additional()
    let limit = 10 * PERMIT_GRANULARITY_BYTES;
    let manager = MemoryManager::new(limit, NoOpMetrics);

    let mut guard = manager.try_acquire(512 * 1024).unwrap(); // 512KB
    assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); // Rounds up to 1MB
-    assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more
+    assert!(guard.try_acquire_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more
    assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
 }

 #[test]
 fn test_acquire_zero_bytes_lazy_allocation() {
-    // Test that acquire(0) returns 0 permits but can request_additional() later
+    // Test that try_acquire(0) consumes no permits but the guard can try_acquire_additional() later
    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);

    let mut guard = manager.try_acquire(0).unwrap();
    assert_eq!(guard.granted_bytes(), 0); // No permits consumed
    assert_eq!(manager.used_bytes(), 0);

-    assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation
+    assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation
    assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
 }
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_additional_blocks_and_unblocks() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    // First guard takes 9MB, leaving only 1MB available
+    let mut guard1 = manager.try_acquire(9 * PERMIT_GRANULARITY_BYTES).unwrap();
+    assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
+
+    // Spawn a task that will block trying to acquire an additional 5MB (only 1MB is currently available)
+    let manager_clone = manager.clone();
+    let waiter = tokio::spawn(async move {
+        let mut guard2 = manager_clone.try_acquire(0).unwrap();
+        // This will block until enough memory is available
+        guard2
+            .acquire_additional(5 * PERMIT_GRANULARITY_BYTES)
+            .await
+            .unwrap();
+        guard2
+    });
+
+    sleep(Duration::from_millis(10)).await;
+
+    // Release 5MB from guard1 - this should unblock the waiter
+    assert!(guard1.release_partial(5 * PERMIT_GRANULARITY_BYTES));
+
+    // Waiter should complete now
+    let guard2 = waiter.await.unwrap();
+    assert_eq!(guard2.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+
+    // Total: guard1 has 4MB, guard2 has 5MB = 9MB
+    assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_additional_exceeds_total_limit() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    // Try to acquire additional 5MB - would exceed total limit of 10MB
+    let result = guard.acquire_additional(5 * PERMIT_GRANULARITY_BYTES).await;
+    assert!(result.is_err());
+
+    // Guard should remain unchanged
+    assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_additional_success() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(3 * PERMIT_GRANULARITY_BYTES).unwrap();
+    assert_eq!(manager.used_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
+
+    // Acquire additional 4MB - should succeed
+    guard
+        .acquire_additional(4 * PERMIT_GRANULARITY_BYTES)
+        .await
+        .unwrap();
+    assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_additional_with_policy_wait_success() {
+    use crate::policy::OnExhaustedPolicy;
+
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let mut guard1 = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    let manager_clone = manager.clone();
+    let waiter = tokio::spawn(async move {
+        let mut guard2 = manager_clone.try_acquire(0).unwrap();
+        // Wait policy with 1 second timeout
+        guard2
+            .acquire_additional_with_policy(
+                5 * PERMIT_GRANULARITY_BYTES,
+                OnExhaustedPolicy::Wait {
+                    timeout: Duration::from_secs(1),
+                },
+            )
+            .await
+            .unwrap();
+        guard2
+    });
+
+    sleep(Duration::from_millis(10)).await;
+
+    // Release memory to unblock waiter
+    assert!(guard1.release_partial(5 * PERMIT_GRANULARITY_BYTES));
+
+    let guard2 = waiter.await.unwrap();
+    assert_eq!(guard2.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_additional_with_policy_wait_timeout() {
+    use crate::policy::OnExhaustedPolicy;
+
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    // Take all memory
+    let _guard1 = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    let mut guard2 = manager.try_acquire(0).unwrap();
+
+    // Try to acquire with short timeout - should timeout
+    let result = guard2
+        .acquire_additional_with_policy(
+            5 * PERMIT_GRANULARITY_BYTES,
+            OnExhaustedPolicy::Wait {
+                timeout: Duration::from_millis(50),
+            },
+        )
+        .await;
+
+    assert!(result.is_err());
+    assert_eq!(guard2.granted_bytes(), 0);
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_additional_with_policy_fail() {
+    use crate::policy::OnExhaustedPolicy;
+
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let _guard1 = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    let mut guard2 = manager.try_acquire(0).unwrap();
+
+    // Fail policy - should return error immediately
+    let result = guard2
+        .acquire_additional_with_policy(5 * PERMIT_GRANULARITY_BYTES, OnExhaustedPolicy::Fail)
+        .await;
+
+    assert!(result.is_err());
+    assert_eq!(guard2.granted_bytes(), 0);
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_additional_unlimited() {
+    let manager = MemoryManager::new(0, NoOpMetrics); // Unlimited
+    let mut guard = manager.try_acquire(0).unwrap();
+
+    // Should always succeed with unlimited manager
+    guard
+        .acquire_additional(1000 * PERMIT_GRANULARITY_BYTES)
+        .await
+        .unwrap();
+    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(manager.used_bytes(), 0);
+}
--- a/src/common/meta/src/ddl.rs
+++ b/src/common/meta/src/ddl.rs
@@ -28,6 +28,7 @@ use crate::node_manager::NodeManagerRef;
 use crate::region_keeper::MemoryRegionKeeperRef;
 use crate::region_registry::LeaderRegionRegistryRef;

+pub mod allocator;
 pub mod alter_database;
 pub mod alter_logical_tables;
 pub mod alter_table;
@@ -36,8 +37,7 @@ pub mod create_database;
 pub mod create_flow;
 pub mod create_logical_tables;
 pub mod create_table;
-mod create_table_template;
-pub(crate) use create_table_template::{CreateRequestBuilder, build_template_from_raw_table_info};
+pub(crate) use create_table::{CreateRequestBuilder, build_template_from_raw_table_info};
 pub mod create_view;
 pub mod drop_database;
 pub mod drop_flow;
--- a/src/common/meta/src/ddl/allocator.rs
+++ b/src/common/meta/src/ddl/allocator.rs
@@ -0,0 +1,17 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod region_routes;
+pub mod resource_id;
+pub mod wal_options;
--- a/src/common/meta/src/ddl/allocator/region_routes.rs
+++ b/src/common/meta/src/ddl/allocator/region_routes.rs
@@ -0,0 +1,105 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_telemetry::debug;
+use snafu::ensure;
+use store_api::storage::{RegionId, RegionNumber, TableId};
+
+use crate::error::{self, Result};
+use crate::peer::PeerAllocator;
+use crate::rpc::router::{Region, RegionRoute};
+
+/// Shared handle to a [RegionRoutesAllocator] trait object.
+pub type RegionRoutesAllocatorRef = Arc<dyn RegionRoutesAllocator>;
+
+/// Allocates the region routes of a table.
+#[async_trait::async_trait]
+pub trait RegionRoutesAllocator: Send + Sync {
+    /// Builds the [RegionRoute]s of table `table_id`.
+    ///
+    /// Each entry of `regions_and_partitions` pairs a region number with the partition
+    /// expression to store on that region.
+    async fn allocate(
+        &self,
+        table_id: TableId,
+        regions_and_partitions: &[(RegionNumber, &str)],
+    ) -> Result<Vec<RegionRoute>>;
+}
+
+#[async_trait::async_trait]
+impl<T: PeerAllocator> RegionRoutesAllocator for T {
+    /// Requests `max(regions, 1)` peers from the [PeerAllocator] and makes them the region
+    /// leaders, wrapping around if fewer peers than regions are returned.
+    ///
+    /// A table without partitions still gets one default region with region number 0.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the peer allocation fails or yields no peer at all.
+    async fn allocate(
+        &self,
+        table_id: TableId,
+        regions_and_partitions: &[(RegionNumber, &str)],
+    ) -> Result<Vec<RegionRoute>> {
+        let regions = regions_and_partitions.len().max(1);
+        let peers = self.alloc(regions).await?;
+        debug!("Allocated peers {:?} for table {}", peers, table_id,);
+
+        // At least one peer was requested. Without this check an empty answer would panic
+        // below (`i % 0` or `peers[0]`) instead of failing the request.
+        ensure!(
+            !peers.is_empty(),
+            error::UnexpectedSnafu {
+                err_msg: format!("No peers allocated for table {table_id}"),
+            }
+        );
+
+        let mut region_routes = regions_and_partitions
+            .iter()
+            .enumerate()
+            .map(|(i, (region_number, partition))| {
+                let region = Region {
+                    id: RegionId::new(table_id, *region_number),
+                    partition_expr: partition.to_string(),
+                    ..Default::default()
+                };
+
+                // Round-robin over the allocated peers.
+                let peer = peers[i % peers.len()].clone();
+
+                RegionRoute {
+                    region,
+                    leader_peer: Some(peer),
+                    ..Default::default()
+                }
+            })
+            .collect::<Vec<_>>();
+
+        // If the table has no partitions, we need to create a default region.
+        if region_routes.is_empty() {
+            region_routes.push(RegionRoute {
+                region: Region {
+                    id: RegionId::new(table_id, 0),
+                    ..Default::default()
+                },
+                leader_peer: Some(peers[0].clone()),
+                ..Default::default()
+            });
+        }
+
+        Ok(region_routes)
+    }
+}
--- a/src/common/meta/src/ddl/allocator/resource_id.rs
+++ b/src/common/meta/src/ddl/allocator/resource_id.rs
@@ -0,0 +1,42 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::ops::Range;
+use std::sync::Arc;
+
+use crate::error::Result;
+
+/// Shared handle to a [ResourceIdAllocator] trait object.
+pub type ResourceIdAllocatorRef = Arc<dyn ResourceIdAllocator>;
+
+/// An allocator that hands out `u64` values from a sequence.
+#[async_trait::async_trait]
+pub trait ResourceIdAllocator: Send + Sync {
+    /// Returns the next value and increments the sequence.
+    async fn next(&self) -> Result<u64>;
+
+    /// Returns the current value stored in the remote storage without incrementing the sequence.
+    async fn peek(&self) -> Result<u64>;
+
+    /// Jumps to the given value.
+    ///
+    /// NOTE(review): presumably `next` is the value the following [`Self::next`] call returns;
+    /// confirm against the implementors.
+    async fn jump_to(&self, next: u64) -> Result<()>;
+
+    /// Returns the range of available sequences as a half-open [Range] (`start..end`).
+    ///
+    /// Unlike the other methods, this one returns the range directly instead of a `Result`.
+    async fn min_max(&self) -> Range<u64>;
+}
--- a/src/common/meta/src/ddl/allocator/wal_options.rs
+++ b/src/common/meta/src/ddl/allocator/wal_options.rs
@@ -0,0 +1,37 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use store_api::storage::RegionNumber;
+
+use crate::error::Result;
+
+/// Shared handle to a [WalOptionsAllocator] trait object.
+pub type WalOptionsAllocatorRef = Arc<dyn WalOptionsAllocator>;
+
+/// Allocates the WAL options of regions.
+#[async_trait::async_trait]
+pub trait WalOptionsAllocator: Send + Sync {
+    /// Allocates WAL options for `region_numbers` and returns them keyed by region number.
+    ///
+    /// NOTE(review): the `String` values are presumably the encoded WAL options, and `skip_wal`
+    /// presumably asks for options that bypass the WAL; confirm against the implementors.
+    async fn allocate(
+        &self,
+        region_numbers: &[RegionNumber],
+        skip_wal: bool,
+    ) -> Result<HashMap<RegionNumber, String>>;
+}
--- a/src/common/meta/src/ddl/alter_table/region_request.rs
+++ b/src/common/meta/src/ddl/alter_table/region_request.rs
@@ -22,7 +22,7 @@ use snafu::OptionExt;
 use table::metadata::RawTableInfo;

 use crate::ddl::alter_table::AlterTableProcedure;
-use crate::error::{InvalidProtoMsgSnafu, Result};
+use crate::error::{self, InvalidProtoMsgSnafu, Result};

 impl AlterTableProcedure {
    /// Makes alter kind proto that all regions can reuse.
@@ -112,6 +112,10 @@ fn create_proto_alter_kind(
        Kind::UnsetIndexes(v) => Ok(Some(alter_request::Kind::UnsetIndexes(v.clone()))),
        Kind::DropDefaults(v) => Ok(Some(alter_request::Kind::DropDefaults(v.clone()))),
        Kind::SetDefaults(v) => Ok(Some(alter_request::Kind::SetDefaults(v.clone()))),
+        Kind::Repartition(_) => error::UnexpectedSnafu {
+            err_msg: "Repartition operation should be handled through DdlManager and not converted to AlterTableRequest",
+        }
+        .fail()?,
    }
 }

--- a/src/common/meta/src/ddl/create_logical_tables.rs
+++ b/src/common/meta/src/ddl/create_logical_tables.rs
@@ -30,7 +30,7 @@ use serde::{Deserialize, Serialize};
 use snafu::ResultExt;
 use store_api::metadata::ColumnMetadata;
 use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY;
-use store_api::storage::{RegionId, RegionNumber};
+use store_api::storage::RegionNumber;
 use strum::AsRefStr;
 use table::metadata::{RawTableInfo, TableId};

@@ -286,14 +286,7 @@ impl CreateTablesData {
            .flat_map(|(task, table_id)| {
                if table_id.is_none() {
                    let table_info = task.table_info.clone();
-                    let region_ids = self
-                        .physical_region_numbers
-                        .iter()
-                        .map(|region_number| {
-                            RegionId::new(table_info.ident.table_id, *region_number)
-                        })
-                        .collect();
-                    let table_route = TableRouteValue::logical(self.physical_table_id, region_ids);
+                    let table_route = TableRouteValue::logical(self.physical_table_id);
                    Some((table_info, table_route))
                } else {
                    None
--- a/src/common/meta/src/ddl/create_logical_tables/region_request.rs
+++ b/src/common/meta/src/ddl/create_logical_tables/region_request.rs
@@ -22,7 +22,7 @@ use store_api::storage::{RegionId, TableId};
 use table::metadata::RawTableInfo;

 use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
-use crate::ddl::create_table_template::{
+use crate::ddl::create_table::template::{
    CreateRequestBuilder, build_template, build_template_from_raw_table_info,
 };
 use crate::ddl::utils::region_storage_path;
@@ -97,7 +97,7 @@ pub fn create_region_request_builder(

 /// Builds a [CreateRequestBuilder] from a [RawTableInfo].
 ///
-/// Note: **This method is only used for creating logical tables.**
+/// Note: This function is primarily intended for creating logical tables or allocating placeholder regions.
 pub fn create_region_request_builder_from_raw_table_info(
    raw_table_info: &RawTableInfo,
    physical_table_id: TableId,
--- a/src/common/meta/src/ddl/create_table.rs
+++ b/src/common/meta/src/ddl/create_table.rs
@@ -12,74 +12,99 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+pub mod executor;
+pub mod template;
+
 use std::collections::HashMap;

-use api::v1::region::region_request::Body as PbRegionRequest;
-use api::v1::region::{RegionRequest, RegionRequestHeader};
+use api::v1::CreateTableExpr;
 use async_trait::async_trait;
 use common_error::ext::BoxedError;
 use common_procedure::error::{
    ExternalSnafu, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu,
 };
 use common_procedure::{Context as ProcedureContext, LockKey, Procedure, ProcedureId, Status};
-use common_telemetry::tracing_context::TracingContext;
-use common_telemetry::{info, warn};
-use futures::future::join_all;
+use common_telemetry::info;
 use serde::{Deserialize, Serialize};
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ResultExt};
 use store_api::metadata::ColumnMetadata;
-use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY;
-use store_api::storage::{RegionId, RegionNumber};
+use store_api::storage::RegionNumber;
 use strum::AsRefStr;
 use table::metadata::{RawTableInfo, TableId};
+use table::table_name::TableName;
 use table::table_reference::TableReference;
+pub(crate) use template::{CreateRequestBuilder, build_template_from_raw_table_info};

-use crate::ddl::create_table_template::{CreateRequestBuilder, build_template};
-use crate::ddl::utils::raw_table_info::update_table_info_column_ids;
-use crate::ddl::utils::{
-    add_peer_context_if_needed, convert_region_routes_to_detecting_regions,
-    extract_column_metadatas, map_to_procedure_error, region_storage_path,
-};
+use crate::ddl::create_table::executor::CreateTableExecutor;
+use crate::ddl::create_table::template::build_template;
+use crate::ddl::utils::map_to_procedure_error;
 use crate::ddl::{DdlContext, TableMetadata};
 use crate::error::{self, Result};
-use crate::key::table_name::TableNameKey;
-use crate::key::table_route::{PhysicalTableRouteValue, TableRouteValue};
+use crate::key::table_route::PhysicalTableRouteValue;
 use crate::lock_key::{CatalogLock, SchemaLock, TableNameLock};
 use crate::metrics;
 use crate::region_keeper::OperatingRegionGuard;
 use crate::rpc::ddl::CreateTableTask;
-use crate::rpc::router::{
-    RegionRoute, find_leader_regions, find_leaders, operating_leader_regions,
-};
+use crate::rpc::router::{RegionRoute, operating_leader_regions};
+
 pub struct CreateTableProcedure {
     pub context: DdlContext,
-    pub creator: TableCreator,
+    /// The serializable data; the only part written by `dump` and read back by `from_json`.
+    pub data: CreateTableData,
+    /// The guards of the opening regions; not serialized, re-registered in `recover`.
+    pub opening_regions: Vec<OperatingRegionGuard>,
+    /// The executor of the procedure, rebuilt from the task's `create_table` expr.
+    pub executor: CreateTableExecutor,
+}
+
+/// Builds the [CreateTableExecutor] of `create_table_expr`: request template (no physical
+/// table id), full table name and the `create_if_not_exists` flag.
+fn build_executor_from_create_table_data(
+    create_table_expr: &CreateTableExpr,
+) -> Result<CreateTableExecutor> {
+    let builder = CreateRequestBuilder::new(build_template(create_table_expr)?, None);
+    let table_name = TableName::new(
+        create_table_expr.catalog_name.clone(),
+        create_table_expr.schema_name.clone(),
+        create_table_expr.table_name.clone(),
+    );
+    let if_not_exists = create_table_expr.create_if_not_exists;
+    Ok(CreateTableExecutor::new(table_name, if_not_exists, builder))
 }

 impl CreateTableProcedure {
    pub const TYPE_NAME: &'static str = "metasrv-procedure::CreateTable";

-    pub fn new(task: CreateTableTask, context: DdlContext) -> Self {
-        Self {
+    pub fn new(task: CreateTableTask, context: DdlContext) -> Result<Self> {
+        let executor = build_executor_from_create_table_data(&task.create_table)?;
+
+        Ok(Self {
            context,
-            creator: TableCreator::new(task),
-        }
+            data: CreateTableData::new(task),
+            opening_regions: vec![],
+            executor,
+        })
    }

    pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
-        let data = serde_json::from_str(json).context(FromJsonSnafu)?;
+        let data: CreateTableData = serde_json::from_str(json).context(FromJsonSnafu)?;
+        let create_table_expr = &data.task.create_table;
+        let executor = build_executor_from_create_table_data(create_table_expr)
+            .map_err(BoxedError::new)
+            .context(ExternalSnafu {
+                clean_poisons: false,
+            })?;

        Ok(CreateTableProcedure {
            context,
-            creator: TableCreator {
-                data,
-                opening_regions: vec![],
-            },
+            data,
+            opening_regions: vec![],
+            executor,
        })
    }

    fn table_info(&self) -> &RawTableInfo {
-        &self.creator.data.task.table_info
+        &self.data.task.table_info
    }

    pub(crate) fn table_id(&self) -> TableId {
@@ -87,8 +112,7 @@ impl CreateTableProcedure {
    }

    fn region_wal_options(&self) -> Result<&HashMap<RegionNumber, String>> {
-        self.creator
-            .data
+        self.data
            .region_wal_options
            .as_ref()
            .context(error::UnexpectedSnafu {
@@ -97,8 +121,7 @@ impl CreateTableProcedure {
    }

    fn table_route(&self) -> Result<&PhysicalTableRouteValue> {
-        self.creator
-            .data
+        self.data
            .table_route
            .as_ref()
            .context(error::UnexpectedSnafu {
@@ -106,17 +129,6 @@ impl CreateTableProcedure {
            })
    }

-    #[cfg(any(test, feature = "testing"))]
-    pub fn set_allocated_metadata(
-        &mut self,
-        table_id: TableId,
-        table_route: PhysicalTableRouteValue,
-        region_wal_options: HashMap<RegionNumber, String>,
-    ) {
-        self.creator
-            .set_allocated_metadata(table_id, table_route, region_wal_options)
-    }
-
    /// On the prepare step, it performs:
    /// - Checks whether the table exists.
    /// - Allocates the table id.
@@ -125,31 +137,16 @@ impl CreateTableProcedure {
    /// - TableName exists and `create_if_not_exists` is false.
    /// - Failed to allocate [TableMetadata].
    pub(crate) async fn on_prepare(&mut self) -> Result<Status> {
-        let expr = &self.creator.data.task.create_table;
-        let table_name_value = self
-            .context
-            .table_metadata_manager
-            .table_name_manager()
-            .get(TableNameKey::new(
-                &expr.catalog_name,
-                &expr.schema_name,
-                &expr.table_name,
-            ))
+        let table_id = self
+            .executor
+            .on_prepare(&self.context.table_metadata_manager)
            .await?;
-
-        if let Some(value) = table_name_value {
-            ensure!(
-                expr.create_if_not_exists,
-                error::TableAlreadyExistsSnafu {
-                    table_name: self.creator.data.table_ref().to_string(),
-                }
-            );
-
-            let table_id = value.table_id();
+        // Return the table id if the table already exists.
+        if let Some(table_id) = table_id {
            return Ok(Status::done_with_output(table_id));
        }

-        self.creator.data.state = CreateTableState::DatanodeCreateRegions;
+        self.data.state = CreateTableState::DatanodeCreateRegions;
        let TableMetadata {
            table_id,
            table_route,
@@ -157,23 +154,13 @@ impl CreateTableProcedure {
        } = self
            .context
            .table_metadata_allocator
-            .create(&self.creator.data.task)
+            .create(&self.data.task)
            .await?;
-        self.creator
-            .set_allocated_metadata(table_id, table_route, region_wal_options);
+        self.set_allocated_metadata(table_id, table_route, region_wal_options);

        Ok(Status::executing(true))
    }

-    pub fn new_region_request_builder(
-        &self,
-        physical_table_id: Option<TableId>,
-    ) -> Result<CreateRequestBuilder> {
-        let create_table_expr = &self.creator.data.task.create_table;
-        let template = build_template(create_table_expr)?;
-        Ok(CreateRequestBuilder::new(template, physical_table_id))
-    }
-
    /// Creates regions on datanodes
    ///
    /// Abort(non-retry):
@@ -187,90 +174,29 @@ impl CreateTableProcedure {
    ///   - [Code::Unavailable](tonic::status::Code::Unavailable)
    pub async fn on_datanode_create_regions(&mut self) -> Result<Status> {
        let table_route = self.table_route()?.clone();
-        let request_builder = self.new_region_request_builder(None)?;
        // Registers opening regions
-        let guards = self
-            .creator
-            .register_opening_regions(&self.context, &table_route.region_routes)?;
+        let guards = self.register_opening_regions(&self.context, &table_route.region_routes)?;
        if !guards.is_empty() {
-            self.creator.opening_regions = guards;
+            self.opening_regions = guards;
        }
-        self.create_regions(&table_route.region_routes, request_builder)
-            .await
+        self.create_regions(&table_route.region_routes).await
    }

-    async fn create_regions(
-        &mut self,
-        region_routes: &[RegionRoute],
-        request_builder: CreateRequestBuilder,
-    ) -> Result<Status> {
-        let create_table_data = &self.creator.data;
-        // Safety: the region_wal_options must be allocated
+    async fn create_regions(&mut self, region_routes: &[RegionRoute]) -> Result<Status> {
+        let table_id = self.table_id();
        let region_wal_options = self.region_wal_options()?;
-        let create_table_expr = &create_table_data.task.create_table;
-        let catalog = &create_table_expr.catalog_name;
-        let schema = &create_table_expr.schema_name;
-        let storage_path = region_storage_path(catalog, schema);
-        let leaders = find_leaders(region_routes);
-        let mut create_region_tasks = Vec::with_capacity(leaders.len());
+        let column_metadatas = self
+            .executor
+            .on_create_regions(
+                &self.context.node_manager,
+                table_id,
+                region_routes,
+                region_wal_options,
+            )
+            .await?;

-        let partition_exprs = region_routes
-            .iter()
-            .map(|r| (r.region.id.region_number(), r.region.partition_expr()))
-            .collect();
-
-        for datanode in leaders {
-            let requester = self.context.node_manager.datanode(&datanode).await;
-
-            let regions = find_leader_regions(region_routes, &datanode);
-            let mut requests = Vec::with_capacity(regions.len());
-            for region_number in regions {
-                let region_id = RegionId::new(self.table_id(), region_number);
-                let create_region_request = request_builder.build_one(
-                    region_id,
-                    storage_path.clone(),
-                    region_wal_options,
-                    &partition_exprs,
-                );
-                requests.push(PbRegionRequest::Create(create_region_request));
-            }
-
-            for request in requests {
-                let request = RegionRequest {
-                    header: Some(RegionRequestHeader {
-                        tracing_context: TracingContext::from_current_span().to_w3c(),
-                        ..Default::default()
-                    }),
-                    body: Some(request),
-                };
-
-                let datanode = datanode.clone();
-                let requester = requester.clone();
-                create_region_tasks.push(async move {
-                    requester
-                        .handle(request)
-                        .await
-                        .map_err(add_peer_context_if_needed(datanode))
-                });
-            }
-        }
-
-        let mut results = join_all(create_region_tasks)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<_>>>()?;
-
-        if let Some(column_metadatas) =
-            extract_column_metadatas(&mut results, TABLE_COLUMN_METADATA_EXTENSION_KEY)?
-        {
-            self.creator.data.column_metadatas = column_metadatas;
-        } else {
-            warn!(
-                "creating table result doesn't contains extension key `{TABLE_COLUMN_METADATA_EXTENSION_KEY}`,leaving the table's column metadata unchanged"
-            );
-        }
-
-        self.creator.data.state = CreateTableState::CreateMetadata;
+        self.data.column_metadatas = column_metadatas;
+        self.data.state = CreateTableState::CreateMetadata;
        Ok(Status::executing(true))
    }

@@ -280,107 +206,33 @@ impl CreateTableProcedure {
    /// - Failed to create table metadata.
    async fn on_create_metadata(&mut self, pid: ProcedureId) -> Result<Status> {
        let table_id = self.table_id();
-        let table_ref = self.creator.data.table_ref();
+        let table_ref = self.data.table_ref();
        let manager = &self.context.table_metadata_manager;

-        let mut raw_table_info = self.table_info().clone();
-        if !self.creator.data.column_metadatas.is_empty() {
-            update_table_info_column_ids(&mut raw_table_info, &self.creator.data.column_metadatas);
-        }
+        let raw_table_info = self.table_info().clone();
        // Safety: the region_wal_options must be allocated.
        let region_wal_options = self.region_wal_options()?.clone();
        // Safety: the table_route must be allocated.
        let physical_table_route = self.table_route()?.clone();
-        let detecting_regions =
-            convert_region_routes_to_detecting_regions(&physical_table_route.region_routes);
-        let table_route = TableRouteValue::Physical(physical_table_route);
-        manager
-            .create_table_metadata(raw_table_info, table_route, region_wal_options)
+        self.executor
+            .on_create_metadata(
+                manager,
+                &self.context.region_failure_detector_controller,
+                raw_table_info,
+                &self.data.column_metadatas,
+                physical_table_route,
+                region_wal_options,
+            )
            .await?;
-        self.context
-            .register_failure_detectors(detecting_regions)
-            .await;
+
        info!(
            "Successfully created table: {}, table_id: {}, procedure_id: {}",
            table_ref, table_id, pid
        );

-        self.creator.opening_regions.clear();
+        self.opening_regions.clear();
        Ok(Status::done_with_output(table_id))
    }
-}
-
-#[async_trait]
-impl Procedure for CreateTableProcedure {
-    fn type_name(&self) -> &str {
-        Self::TYPE_NAME
-    }
-
-    fn recover(&mut self) -> ProcedureResult<()> {
-        // Only registers regions if the table route is allocated.
-        if let Some(x) = &self.creator.data.table_route {
-            self.creator.opening_regions = self
-                .creator
-                .register_opening_regions(&self.context, &x.region_routes)
-                .map_err(BoxedError::new)
-                .context(ExternalSnafu {
-                    clean_poisons: false,
-                })?;
-        }
-
-        Ok(())
-    }
-
-    async fn execute(&mut self, ctx: &ProcedureContext) -> ProcedureResult<Status> {
-        let state = &self.creator.data.state;
-
-        let _timer = metrics::METRIC_META_PROCEDURE_CREATE_TABLE
-            .with_label_values(&[state.as_ref()])
-            .start_timer();
-
-        match state {
-            CreateTableState::Prepare => self.on_prepare().await,
-            CreateTableState::DatanodeCreateRegions => self.on_datanode_create_regions().await,
-            CreateTableState::CreateMetadata => self.on_create_metadata(ctx.procedure_id).await,
-        }
-        .map_err(map_to_procedure_error)
-    }
-
-    fn dump(&self) -> ProcedureResult<String> {
-        serde_json::to_string(&self.creator.data).context(ToJsonSnafu)
-    }
-
-    fn lock_key(&self) -> LockKey {
-        let table_ref = &self.creator.data.table_ref();
-
-        LockKey::new(vec![
-            CatalogLock::Read(table_ref.catalog).into(),
-            SchemaLock::read(table_ref.catalog, table_ref.schema).into(),
-            TableNameLock::new(table_ref.catalog, table_ref.schema, table_ref.table).into(),
-        ])
-    }
-}
-
-pub struct TableCreator {
-    /// The serializable data.
-    pub data: CreateTableData,
-    /// The guards of opening.
-    pub opening_regions: Vec<OperatingRegionGuard>,
-}
-
-impl TableCreator {
-    pub fn new(task: CreateTableTask) -> Self {
-        Self {
-            data: CreateTableData {
-                state: CreateTableState::Prepare,
-                column_metadatas: vec![],
-                task,
-                table_route: None,
-                region_wal_options: None,
-            },
-            opening_regions: vec![],
-        }
-    }

    /// Registers and returns the guards of the opening region if they don't exist.
    fn register_opening_regions(
@@ -389,7 +241,6 @@ impl TableCreator {
        region_routes: &[RegionRoute],
    ) -> Result<Vec<OperatingRegionGuard>> {
        let opening_regions = operating_leader_regions(region_routes);
-
        if self.opening_regions.len() == opening_regions.len() {
            return Ok(vec![]);
        }
@@ -409,7 +260,7 @@ impl TableCreator {
        Ok(opening_region_guards)
    }

-    fn set_allocated_metadata(
+    pub fn set_allocated_metadata(
        &mut self,
        table_id: TableId,
        table_route: PhysicalTableRouteValue,
@@ -421,6 +272,56 @@ impl TableCreator {
    }
 }

+#[async_trait]
+impl Procedure for CreateTableProcedure {
+    fn type_name(&self) -> &str {
+        Self::TYPE_NAME
+    }
+
+    fn recover(&mut self) -> ProcedureResult<()> {
+        // Guards are not part of the dumped data; re-register them if a table route was allocated.
+        if let Some(x) = &self.data.table_route {
+            self.opening_regions = self
+                .register_opening_regions(&self.context, &x.region_routes)
+                .map_err(BoxedError::new)
+                .context(ExternalSnafu {
+                    clean_poisons: false,
+                })?;
+        }
+
+        Ok(())
+    }
+
+    async fn execute(&mut self, ctx: &ProcedureContext) -> ProcedureResult<Status> {
+        let state = &self.data.state;
+        // Time this step; the metric label comes from `state.as_ref()` (`AsRefStr` derive).
+        let _timer = metrics::METRIC_META_PROCEDURE_CREATE_TABLE
+            .with_label_values(&[state.as_ref()])
+            .start_timer();
+        // Run the step of the current state; errors go through `map_to_procedure_error`.
+        match state {
+            CreateTableState::Prepare => self.on_prepare().await,
+            CreateTableState::DatanodeCreateRegions => self.on_datanode_create_regions().await,
+            CreateTableState::CreateMetadata => self.on_create_metadata(ctx.procedure_id).await,
+        }
+        .map_err(map_to_procedure_error)
+    }
+
+    fn dump(&self) -> ProcedureResult<String> {
+        serde_json::to_string(&self.data).context(ToJsonSnafu)
+    }
+
+    fn lock_key(&self) -> LockKey {
+        let table_ref = &self.data.table_ref();
+        // Catalog and schema are locked for reading; the table name gets its own lock.
+        LockKey::new(vec![
+            CatalogLock::Read(table_ref.catalog).into(),
+            SchemaLock::read(table_ref.catalog, table_ref.schema).into(),
+            TableNameLock::new(table_ref.catalog, table_ref.schema, table_ref.table).into(),
+        ])
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)]
 pub enum CreateTableState {
    /// Prepares to create the table
@@ -444,6 +345,16 @@ pub struct CreateTableData {
 }

 impl CreateTableData {
+    pub fn new(task: CreateTableTask) -> Self {
+        Self {
+            task,
+            state: CreateTableState::Prepare,
+            column_metadatas: Vec::new(),
+            table_route: None,
+            region_wal_options: None,
+        }
+    }
+
    fn table_ref(&self) -> TableReference<'_> {
        self.task.table_ref()
    }
--- a/src/common/meta/src/ddl/create_table/executor.rs
+++ b/src/common/meta/src/ddl/create_table/executor.rs
@@ -0,0 +1,203 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+
+use api::v1::region::region_request::Body as PbRegionRequest;
+use api::v1::region::{RegionRequest, RegionRequestHeader};
+use common_telemetry::tracing_context::TracingContext;
+use common_telemetry::warn;
+use futures::future::join_all;
+use snafu::ensure;
+use store_api::metadata::ColumnMetadata;
+use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY;
+use store_api::storage::{RegionId, RegionNumber};
+use table::metadata::{RawTableInfo, TableId};
+use table::table_name::TableName;
+
+use crate::ddl::utils::raw_table_info::update_table_info_column_ids;
+use crate::ddl::utils::{
+    add_peer_context_if_needed, convert_region_routes_to_detecting_regions,
+    extract_column_metadatas, region_storage_path,
+};
+use crate::ddl::{CreateRequestBuilder, RegionFailureDetectorControllerRef};
+use crate::error::{self, Result};
+use crate::key::TableMetadataManagerRef;
+use crate::key::table_name::TableNameKey;
+use crate::key::table_route::{PhysicalTableRouteValue, TableRouteValue};
+use crate::node_manager::NodeManagerRef;
+use crate::rpc::router::{RegionRoute, find_leader_regions, find_leaders};
+
+/// Executes the steps of creating a table: [`Self::on_prepare`] checks whether the table
+/// name already exists, [`Self::on_create_regions`] creates the regions on the datanodes,
+/// and [`Self::on_create_metadata`] creates the table metadata.
+pub struct CreateTableExecutor {
+    create_if_not_exists: bool, // if false, `on_prepare` fails when the table already exists
+    table_name: TableName, // catalog, schema and table name of the table to create
+    builder: CreateRequestBuilder, // builds the per-region create requests
+}
+
+impl CreateTableExecutor {
+    /// Creates a [`CreateTableExecutor`]; the arguments are stored as-is, nothing is validated.
+    pub fn new(
+        table_name: TableName,
+        create_if_not_exists: bool,
+        builder: CreateRequestBuilder,
+    ) -> Self {
+        Self {
+            create_if_not_exists,
+            table_name,
+            builder,
+        }
+    }
+
+    /// Looks up the table name and reports whether the table already exists.
+    ///
+    /// Returns `Ok(Some(table_id))` if the name is already registered and
+    /// `create_if_not_exists` is `true`, and `Ok(None)` if the name is not registered.
+    ///
+    /// Abort(non-retry): the table exists and `create_if_not_exists` is `false`
+    /// (`TableAlreadyExists`), or the table name lookup itself fails.
+    pub async fn on_prepare(
+        &self,
+        table_metadata_manager: &TableMetadataManagerRef,
+    ) -> Result<Option<TableId>> {
+        let table_name_value = table_metadata_manager
+            .table_name_manager()
+            .get(TableNameKey::new(
+                &self.table_name.catalog_name,
+                &self.table_name.schema_name,
+                &self.table_name.table_name,
+            ))
+            .await?;
+        // A value here means the table name is already registered.
+        if let Some(value) = table_name_value {
+            ensure!(
+                self.create_if_not_exists,
+                error::TableAlreadyExistsSnafu {
+                    table_name: self.table_name.to_string(),
+                }
+            );
+
+            return Ok(Some(value.table_id()));
+        }
+
+        Ok(None)
+    }
+
+    pub async fn on_create_regions(
+        &self,
+        node_manager: &NodeManagerRef,
+        table_id: TableId,
+        region_routes: &[RegionRoute],
+        region_wal_options: &HashMap<RegionNumber, String>,
+    ) -> Result<Vec<ColumnMetadata>> {
+        let storage_path =
+            region_storage_path(&self.table_name.catalog_name, &self.table_name.schema_name);
+        let leaders = find_leaders(region_routes); // each one is resolved to a requester below
+        let mut create_region_tasks = Vec::with_capacity(leaders.len());
+        let partition_exprs = region_routes
+            .iter()
+            .map(|r| (r.region.id.region_number(), r.region.partition_expr()))
+            .collect::<HashMap<_, _>>();
+        // For every leader, build one create request per region `find_leader_regions` returns.
+        for datanode in leaders {
+            let requester = node_manager.datanode(&datanode).await;
+
+            let regions = find_leader_regions(region_routes, &datanode);
+            let mut requests = Vec::with_capacity(regions.len());
+            for region_number in regions {
+                let region_id = RegionId::new(table_id, region_number);
+                let create_region_request = self.builder.build_one(
+                    region_id,
+                    storage_path.clone(),
+                    region_wal_options,
+                    &partition_exprs,
+                );
+                requests.push(PbRegionRequest::Create(create_region_request));
+            }
+            // Wrap each body with a tracing header and queue it as a not-yet-awaited future.
+            for request in requests {
+                let request = RegionRequest {
+                    header: Some(RegionRequestHeader {
+                        tracing_context: TracingContext::from_current_span().to_w3c(),
+                        ..Default::default()
+                    }),
+                    body: Some(request),
+                };
+
+                let datanode = datanode.clone();
+                let requester = requester.clone();
+                create_region_tasks.push(async move {
+                    requester
+                        .handle(request)
+                        .await
+                        .map_err(add_peer_context_if_needed(datanode))
+                });
+            }
+        }
+        // Await every queued request, then fail with the first error in task order, if any.
+        let mut results = join_all(create_region_tasks)
+            .await
+            .into_iter()
+            .collect::<Result<Vec<_>>>()?;
+        // If `extract_column_metadatas` yields `None`, warn and return an empty list instead.
+        let column_metadatas = if let Some(column_metadatas) =
+            extract_column_metadatas(&mut results, TABLE_COLUMN_METADATA_EXTENSION_KEY)?
+        {
+            column_metadatas
+        } else {
+            warn!(
+                "creating table result doesn't contains extension key `{TABLE_COLUMN_METADATA_EXTENSION_KEY}`,leaving the table's column metadata unchanged"
+            );
+            vec![]
+        };
+
+        Ok(column_metadatas)
+    }
+
+    /// Creates the table metadata, then registers failure detectors for the table's regions.
+    ///
+    /// If `column_metadatas` is non-empty, `update_table_info_column_ids` is applied first.
+    /// Abort(non-retry): failed to create the table metadata.
+    pub async fn on_create_metadata(
+        &self,
+        table_metadata_manager: &TableMetadataManagerRef,
+        region_failure_detector_controller: &RegionFailureDetectorControllerRef,
+        mut raw_table_info: RawTableInfo,
+        column_metadatas: &[ColumnMetadata],
+        table_route: PhysicalTableRouteValue,
+        region_wal_options: HashMap<RegionNumber, String>,
+    ) -> Result<()> {
+        if !column_metadatas.is_empty() {
+            update_table_info_column_ids(&mut raw_table_info, column_metadatas);
+        }
+        let detecting_regions = // taken before `table_route` is moved below
+            convert_region_routes_to_detecting_regions(&table_route.region_routes);
+        let table_route = TableRouteValue::Physical(table_route);
+        table_metadata_manager
+            .create_table_metadata(raw_table_info, table_route, region_wal_options)
+            .await?;
+        region_failure_detector_controller
+            .register_failure_detectors(detecting_regions)
+            .await;
+
+        Ok(())
+    }
+
+    /// Returns a reference to the [`CreateRequestBuilder`] this executor was constructed with.
+    pub fn builder(&self) -> &CreateRequestBuilder {
+        &self.builder
+    }
+}
--- a/src/common/meta/src/ddl/create_table/template.rs
+++ b/src/common/meta/src/ddl/create_table/template.rs
@@ -20,19 +20,17 @@ use api::v1::region::{CreateRequest, RegionColumnDef};
 use api::v1::{ColumnDef, CreateTableExpr, SemanticType};
 use common_telemetry::warn;
 use snafu::{OptionExt, ResultExt};
-use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME};
+use store_api::metric_engine_consts::LOGICAL_TABLE_METADATA_KEY;
 use store_api::storage::{RegionId, RegionNumber};
 use table::metadata::{RawTableInfo, TableId};

 use crate::error::{self, Result};
-use crate::wal_options_allocator::prepare_wal_options;
+use crate::wal_provider::prepare_wal_options;

-/// Builds a [CreateRequest] from a [RawTableInfo].
+/// Constructs a [CreateRequest] based on the provided [RawTableInfo].
 ///
-/// Note: **This method is only used for creating logical tables.**
-pub(crate) fn build_template_from_raw_table_info(
-    raw_table_info: &RawTableInfo,
-) -> Result<CreateRequest> {
+/// Note: This function is primarily intended for creating logical tables or allocating placeholder regions.
+pub fn build_template_from_raw_table_info(raw_table_info: &RawTableInfo) -> Result<CreateRequest> {
    let primary_key_indices = &raw_table_info.meta.primary_key_indices;
    let column_defs = raw_table_info
        .meta
@@ -57,7 +55,7 @@ pub(crate) fn build_template_from_raw_table_info(
    let options = HashMap::from(&raw_table_info.meta.options);
    let template = CreateRequest {
        region_id: 0,
-        engine: METRIC_ENGINE_NAME.to_string(),
+        engine: raw_table_info.meta.engine.clone(),
        column_defs,
        primary_key: primary_key_indices.iter().map(|i| *i as u32).collect(),
        path: String::new(),
@@ -138,7 +136,7 @@ pub struct CreateRequestBuilder {
 }

 impl CreateRequestBuilder {
-    pub(crate) fn new(template: CreateRequest, physical_table_id: Option<TableId>) -> Self {
+    pub fn new(template: CreateRequest, physical_table_id: Option<TableId>) -> Self {
        Self {
            template,
            physical_table_id,
--- a/src/common/meta/src/ddl/drop_database/executor.rs
+++ b/src/common/meta/src/ddl/drop_database/executor.rs
@@ -120,7 +120,13 @@ impl State for DropDatabaseExecutor {
            .await?;
        executor.invalidate_table_cache(ddl_ctx).await?;
        executor
-            .on_drop_regions(ddl_ctx, &self.physical_region_routes, true)
+            .on_drop_regions(
+                &ddl_ctx.node_manager,
+                &ddl_ctx.leader_region_registry,
+                &self.physical_region_routes,
+                true,
+                false,
+            )
            .await?;
        info!("Table: {}({}) is dropped", self.table_name, self.table_id);

--- a/src/common/meta/src/ddl/drop_table.rs
+++ b/src/common/meta/src/ddl/drop_table.rs
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-pub(crate) mod executor;
+pub mod executor;
 mod metadata;

 use std::collections::HashMap;
@@ -156,7 +156,13 @@ impl DropTableProcedure {

    pub async fn on_datanode_drop_regions(&mut self) -> Result<Status> {
        self.executor
-            .on_drop_regions(&self.context, &self.data.physical_region_routes, false)
+            .on_drop_regions(
+                &self.context.node_manager,
+                &self.context.leader_region_registry,
+                &self.data.physical_region_routes,
+                false,
+                false,
+            )
            .await?;
        self.data.state = DropTableState::DeleteTombstone;
        Ok(Status::executing(true))
--- a/Show More
+++ b/Show More