Compare commits


108 Commits

Author SHA1 Message Date
Ruihang Xia
572e29b158 feat: support tls for pg backend (#6611)
* load tls

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl tls

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* pass options

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement require mode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update config

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* default to prefer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update example config

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* adjust example config

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* handle client cert and key properly

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement verify_ca and verify_full

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update integration test for config api

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change config name and default mode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-08-04 00:41:08 +00:00
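As a reading aid for the TLS modes named in the commit above (prefer, require, verify_ca, verify_full), here is a minimal illustrative sketch in Rust; the enum name, variants, and the `prefer` default are assumptions drawn only from the commit messages, not the actual types added in #6611.

```rust
/// Illustrative only: PostgreSQL-style TLS modes for the metadata backend.
/// Names and the default shown here are assumptions, not the code from #6611
/// (a later commit in that PR changes the config name and default mode).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum PgTlsMode {
    /// Try TLS first and fall back to plaintext if the server rejects it.
    #[default]
    Prefer,
    /// Require TLS but do not verify the server certificate.
    Require,
    /// Require TLS and verify the server certificate against a trusted CA.
    VerifyCa,
    /// Like `VerifyCa`, and additionally verify that the hostname matches.
    VerifyFull,
}
```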
zyy17
31cb769507 chore: add limit in resources panel and Cache Miss panel (#6636)
chore: add `limit` in resources panel and 'Cache Miss' panel

Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-08-03 19:09:32 +00:00
yihong
e19493db4a chore: update jieba tantivy-jieba and tantivy version (#6637)
* chore: update jieba tantivy-jieba and tantivy version

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-08-03 19:08:36 +00:00
Ruihang Xia
9817eb934d feat: support __schema__ and __database__ in Prom Remote Read (#6610)
* feat: support __schema__ and __database__ in Prom remote R/W

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix integration test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* revert remote write changes

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* check matcher type

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-08-03 07:09:44 +00:00
Lei, HUANG
8639961cc9 chore: refine metrics tracking the flush/compaction cost time (#6630)
chore: refine metrics tracking the per-stage cost time during flush and compaction

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-08-02 12:13:42 +00:00
Ruihang Xia
a9cd117706 fix: only return the __name__ label when there is one (#6629)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-08-02 08:42:28 +00:00
ZonaHe
9485dbed64 feat: update dashboard to v0.10.6 (#6632)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-08-01 17:55:58 +00:00
discord9
21b71d1e10 feat: panic logger (#6633)
Signed-off-by: discord9 <discord9@163.com>
2025-08-01 11:31:15 +00:00
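For context on what a "panic logger" does, below is a minimal self-contained sketch using only the standard library's panic hook; the actual change in #6633 presumably routes the message through the project's logging facilities rather than stderr.

```rust
use std::panic;

/// Install a hook that logs every panic with its source location.
/// Sketch only: the implementation in #6633 may differ.
fn install_panic_logger() {
    panic::set_hook(Box::new(|info| {
        let location = info
            .location()
            .map(|loc| format!("{}:{}", loc.file(), loc.line()))
            .unwrap_or_else(|| "unknown location".to_string());
        eprintln!("panic at {location}: {info}");
    }));
}

fn main() {
    install_panic_logger();
    // Any subsequent panic is now logged via the hook before unwinding.
}
```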
Weny Xu
cfaa9b4dda feat: introduce reconcile catalog procedure (#6613)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-01 11:03:00 +00:00
Weny Xu
19ad9a7f85 refactor: remove procedure executor from DDL manager (#6625)
* refactor: remove procedure executor from DDL manager

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: clippy

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from  CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-01 09:33:47 +00:00
shuiyisong
9e2f793b04 chore(otlp_metric): update metric and label naming (#6624)
* chore: update otlp metrics & labels naming

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: typo and test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* Update src/session/src/protocol_ctx.rs

* chore: add test cases for normalizing functions

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
Co-authored-by: Ning Sun <classicning@gmail.com>
2025-08-01 08:17:12 +00:00
Yingwen
52466fdd92 feat: Implement a converter to convert KeyValues into BulkPart (#6620)
* chore: add api to memtable to check bulk capability

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: Add a converter to convert KeyValues into BulkPart

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: move supports_bulk_insert to MemtableBuilder

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: benchmark

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: use write_bulk if the memtable benefits from it

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: test BulkPartConverter

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add a flag to store unencoded primary keys

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: cache schema for converter

Implements to_flat_sst_arrow_schema

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: simplify tests

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: don't use bulk convert branch now

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: address review comments

* simplify primary_key_column_builders check
* return error if value is not string

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add FlatSchemaOptions::from_encoding and test sparse encoding

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-01 07:59:11 +00:00
Ruihang Xia
869f8bf68a docs(rfc): compatibility test framework (#6460)
* docs(rfc): compatibility test framework

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename file

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-08-01 04:32:53 +00:00
Yingwen
9527e0df2f feat: HTTP API to activate/deactivate heap prof (activate by default) (#6593)
* feat: add HTTP API to activate/deactivate heap profiling

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add HTTP API to get profiling status

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: enable heap prof by default

Signed-off-by: evenyag <realevenyag@gmail.com>

* build: add "prof:true,prof_active:false" as default env to dockerfiles

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: activate heap profiling after log initialization

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add memory options to control whether to activate profiling

Signed-off-by: evenyag <realevenyag@gmail.com>

* docs: update docs

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fmt toml

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: fix config test

Signed-off-by: evenyag <realevenyag@gmail.com>

* docs: usage of new api

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: log profile after version

Signed-off-by: evenyag <realevenyag@gmail.com>

* docs: update how to docs

Signed-off-by: evenyag <realevenyag@gmail.com>

* docs: fix how to docs

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-01 03:24:56 +00:00
Weny Xu
164afb26da feat: introduce reconcile logical tables procedure (#6588)
* feat: introduce reconcile logical tables procedure

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: lock logical tables

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-31 11:48:06 +00:00
Weny Xu
7d8473e9bc feat: introduce reconcile database procedure (#6612)
* feat: introduce reconcile database procedure

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: hold the schema lock

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add todo

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update comments

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: rename to `fast_fail`

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add logs

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-31 11:01:56 +00:00
shuiyisong
51dc393371 chore(otlp_metric): support attr list in header opts (#6617)
* chore: support attr list in OTLP metrics

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: fix cr issue

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-31 08:54:45 +00:00
Lei, HUANG
a265499325 fix(test): concurrency issue in compaction tests (#6615)
fix/compaction-concurrency:
 Add delay before compaction in `compaction_test.rs`

 - Introduced a 2-millisecond delay using `tokio::time::sleep` before the `compact` function call in `test_compaction_region_with_overlapping_delete_all` to ensure proper timing and synchronization during the test execution.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-07-30 11:24:56 +00:00
shuiyisong
2b4fb2f32a refactor(otlp_metric): make otlp metric compatible with promql (#6543)
* chore: tmp save

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: minor update

* chore: remove metric metadata and introduce shared attrs

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: replace . with _ in metric name

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: minor update & fix tests

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: add legacy mode param to otlp metrics

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: update test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: update test & fix

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: add automatically legacy check for otlp metrics

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: fix clippy

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: typos

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: insert table options in compat mode & add test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: check table options consistency

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: update test and add comments

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: minor tags update

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: minor update about scope labeling

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: update opts using header & update test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: minor code refactor

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: fix cr issue

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-30 11:20:03 +00:00
Yingwen
1df605ec4b feat: more logs and metrics under explain verbose mode (#6575)
* feat: collect region metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: log in info level

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add CoalescePartitionsExec to explain

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: finish metrics in partition and add sender full to metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add eof flag on finish

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: output cost as string

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: log on stream done

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: region id as string

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: enlarge send channel size

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: more log in flight and scan

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: logs about rows/batches/bytes

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: enlarge channel size

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: remote read only log in verbose

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: revert channel change

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: get explain verbose in RegionScanExec

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: print scan log in verbose mode

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: collect region metrics after finishing one region

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: define StreamMetrics and log in verbose mode

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: only log non zero filter and distributor metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: revert displaying CoalescePartitions in explain

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect memtable metrics in partition metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-30 09:23:32 +00:00
Weny Xu
ac8493ab4a feat: introduce reconcile table procedure (#6584)
* feat: introduce `SyncColumns`

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: introduce reconcile table procedure

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add comments

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update proto

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-30 04:42:38 +00:00
Weny Xu
d9d1773913 feat: ignore internal keys in metadata snapshots (#6606)
feat: ignore dumping internal keys

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-30 03:50:11 +00:00
Weny Xu
6afdf672b3 feat: allow setting next table id via http api (#6597)
* feat: allow resetting next table id via http api

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-30 03:46:39 +00:00
ZonaHe
12c43ee27b feat: update dashboard to v0.10.5 (#6604)
Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
2025-07-30 02:18:48 +00:00
fys
a10b1d9885 feat: trigger alter parse (#6553)
* feat: support trigger alter

* fix: cargo fmt

* fix: clippy

* fix: some docs

* fix: cr

* fix: ON -> RENAME
2025-07-29 11:07:31 +00:00
discord9
f07b1daed4 feat: struct vector (#6595)
* feat: struct vector

Signed-off-by: discord9 <discord9@163.com>

* fix: array2vector&arrow type2concrete type

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* chore: resolve some todos

Signed-off-by: discord9 <discord9@163.com>

* refactor: per review

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-29 08:22:27 +00:00
Ruihang Xia
5377db5392 docs(rfc): repartition (#6557)
* docs(rfc): repartition

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix comment and update link

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-07-29 07:58:03 +00:00
Lin Yihai
b6cef77a5c feat: add SET DEFAULT syntax (#6421)
* feat: add `SET DEFAULT` syntax

Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>

* test: add `CURRENT_TIMESTAMP()` as default value for `SET DEFAULT` syntax

Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>

* refactor: Make the error types more precise.

Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>

* chore: a minor error display enhancement for `SET DEFAULT`

Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>

* refactor: Using `MODIFY COLUMN` for `DROP/SET DEFAULT`

Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>

* chore: update `greptime-proto`

Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>

---------

Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>
2025-07-29 06:41:02 +00:00
discord9
8fef177575 feat: fallback when failed to push down using DistPlanner (#6574)
* test: fix fallback testcase

Signed-off-by: discord9 <discord9@163.com>

* add metric

Signed-off-by: discord9 <discord9@163.com>

* feat: fallback add to config variable

Signed-off-by: discord9 <discord9@163.com>

* feat: set in var&set in hint

Signed-off-by: discord9 <discord9@163.com>

* chore: update test

Signed-off-by: discord9 <discord9@163.com>

* feat: also in toml config

Signed-off-by: discord9 <discord9@163.com>

* fix test

Signed-off-by: discord9 <discord9@163.com>

* docs: comment about setting from different source

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-29 04:37:00 +00:00
discord9
4cfc878067 feat: poll result stream more often (#6599)
* feat: poll result stream more often

Signed-off-by: discord9 <discord9@163.com>

* refactor: cleanup match

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-29 03:59:59 +00:00
dennis zhuang
086777d938 feat: impl some promql scalar functions (#6567)
* feat: impl some promql scalar functions

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* feat: supports pi function

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: by cr comments

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: compile

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-07-29 03:29:00 +00:00
Weny Xu
b7fd4ca65d feat: allow ignoring nonexistent regions in recovery mode (#6592)
* feat: allow ignoring nonexistent regions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: ignore nonexistent regions during startup in recovery mode

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: allow enabling recovery mode via http api

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-28 11:02:13 +00:00
Arshdeep
2e571e351f fix: add map datatype conversion in copy_table_from (#6185) (#6422)
Signed-off-by: Arshdeep54 <balarsh535@gmail.com>
2025-07-28 03:53:10 +00:00
yihong
6ded2d267a fix: close issue #6586 make pg also show error as mysql (#6587)
* fix: close issue #6586 make pg also show error as mysql

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: drop useless debug print

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: revert wrong change

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* refactor: convert error types

* refactor: inline

* chore: minimize changes

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: make clippy happy

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* refactor: convert datafusion error to ErrorExt

Signed-off-by: Ning Sun <sunning@greptime.com>

* fix: headers ?

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: Ning Sun <sunning@greptime.com>
Co-authored-by: Ning Sun <sunning@greptime.com>
2025-07-25 09:59:26 +00:00
yihong
3465bedc7f fix: ignore target files in make fmt-check (#6560)
* fix: ignore target files in make fmt-check

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-07-25 09:54:38 +00:00
dennis zhuang
052033d436 feat: supports more db options (#6529)
* feat: supports more db options

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: tests

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: use btree map for consistent results

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* feat: adds compaction keys into valid db options

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-07-25 08:31:11 +00:00
Lei, HUANG
c3201c32c3 feat(mito): replace Memtable::iter with Memtable::ranges (#6549)
* bulk-multiparts-merge-reader:
 **Enhance Memtable Iteration and Flushing Logic**

 - **`flush.rs`**: Updated `RegionFlushTask` to handle multiple ranges using `MergeReaderBuilder` for improved source management during flush operations.
 - **`memtable.rs`**: Introduced `build_prune_iter` and `build_iter` methods in `MemtableRange` for flexible iteration. Added `MemtableRanges` struct to manage multiple contexts.
 - **`simple_bulk_memtable.rs`**: Refactored to use `BatchIterBuilder` and `BatchIterBuilderDeprecated` for iteration, supporting new `read_to_values` method in `Series`.
 - **`time_series.rs`**: Added `read_to_values` and `finish_cloned` methods in `Series` and `ValueBuilder` for efficient data handling.
 - **`scan_util.rs`**: Replaced `build_iter` with `build_prune_iter` for range iteration, enhancing scan utility.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 - **Add Rayon for Parallel Processing**: Introduced `rayon` for parallel processing in `simple_bulk_memtable.rs` and updated `Cargo.toml` and `Cargo.lock` to include `rayon` dependency.
 - **Enhance Benchmarking**: Added new benchmarks in `simple_bulk_memtable.rs` to compare parallel vs sequential processing, projection, sequence filtering, and write performance.
 - **Make Structs and Methods Public**: Changed visibility of several structs and methods to `pub` in `simple_bulk_memtable.rs`, `memtable.rs`, `time_series.rs`, and `test_util.rs` to facilitate testing and benchmarking.
 - **Update Criterion Features**: Modified `Cargo.toml` to include `html_reports` feature for `criterion`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 ### Commit Summary

 - **Refactor `SimpleBulkMemtable`**:
   - Moved `ranges_sequential` function to a new `test_only` module and made it a method of `SimpleBulkMemtable`.
   - Made several fields in `SimpleBulkMemtable` private and added a `region_metadata` getter.
   - Affected files: `simple_bulk_memtable.rs`, `test_only.rs`.

 - **Benchmark Adjustments**:
   - Updated benchmark functions to use the new `ranges_sequential` method.
   - Affected file: `simple_bulk_memtable.rs`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 ### Add Test Configuration for `iter` Method in Memtable Implementations

 - **Enhancements**:
   - Added `#[cfg(any(test, feature = "test"))]` attribute to the `iter` method in various `Memtable` implementations to enable conditional compilation for testing purposes.
   - Affected files:
     - `src/mito2/src/memtable.rs`
     - `src/mito2/src/memtable/bulk.rs`
     - `src/mito2/src/memtable/partition_tree.rs`
     - `src/mito2/src/memtable/simple_bulk_memtable.rs`
     - `src/mito2/src/memtable/time_series.rs`
     - `src/mito2/src/test_util/memtable_util.rs`

 - **Benchmark Adjustments**:
   - Removed `black_box` usage in `bench_memtable_write_performance` function to streamline benchmarking.
   - Affected file: `src/mito2/benches/simple_bulk_memtable.rs`

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 **Enhance Async Support and Refactor Iteration in `mito2`**

 - **Add Async Features**: Updated `Cargo.toml` to include `async` and `async_tokio` features for `criterion`.
 - **Async Iteration**: Introduced async functions `flush` and `flush_original` in `simple_bulk_memtable.rs` to handle memtable flushing using async iterators.
 - **Refactor Iteration Logic**: Moved `create_iter` and `BatchIterBuilderDeprecated` to `test_only.rs` for better separation of concerns.
 - **Public API Change**: Made `next_batch` in `read.rs` public to support async batch processing.
 - **Benchmark Updates**: Modified benchmarks in `simple_bulk_memtable.rs` to use async runtime for performance testing.

 Files affected: `Cargo.toml`, `simple_bulk_memtable.rs`, `test_only.rs`, `read.rs`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 **Enhance Benchmarking for Memtable**

 - Refactored `create_large_memtable` to `create_memtable_with_rows` in `simple_bulk_memtable.rs` to allow dynamic row count configuration.
 - Introduced parameterized benchmarking in `bench_ranges_parallel_vs_sequential` to test various row counts, improving the flexibility and coverage of performance tests.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 ### Enhance Memory Management and Public API

 - **`builder.rs`**: Made `next_offset` method public to allow external access to offset calculations.
 - **`simple_bulk_memtable.rs`**: Simplified the `series.extend` method by removing the iterator conversion for `fields`.
 - **`time_series.rs`**:
   - Added `can_accommodate` method to `ValueBuilder` to check if fields can be accommodated without offset overflow.
   - Modified `extend` method to use a `Vec` for `fields` instead of an iterator, improving memory management and error handling.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 Add License and Enhance Testing in `simple_bulk_memtable.rs`

 - Added Apache License header to `simple_bulk_memtable.rs`.
 - Modified test configuration in `simple_bulk_memtable.rs` to include `any(test, feature = "test")`.
 - Introduced a new test `test_write_read_large_string` in `simple_bulk_memtable.rs` to verify handling of large strings.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 Update `Cargo.toml` dependencies

 - Adjust features for `common-meta` and `mito-codec` to include "testing".
 - Maintain `criterion` version and features for async support.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 ### Update Predicate Type in Memtable Iterators

 - **Files Modified**:
   - `src/mito2/src/memtable.rs`
   - `src/mito2/src/memtable/bulk.rs`
   - `src/mito2/src/memtable/simple_bulk_memtable.rs`

 - **Key Changes**:
   - Updated the `iter` method in `Memtable` trait and its implementations to use `Option<table::predicate::Predicate>` instead of `Option<Predicate>`.
   - Adjusted return type in `BulkMemtable`'s `iter` method to `Result<crate::memtable::BoxedBatchIterator>`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 **Enhance Memtable Functionality**

 - **`memtable.rs`**:
   - Added `Clone` trait to `MemtableStats` and made `num_ranges` public.
   - Introduced `num_rows` field in `MemtableRange` and updated its constructor.
   - Added `num_rows` method to `MemtableRange`.

 - **`partition_tree.rs`, `simple_bulk_memtable.rs`, `time_series.rs`**:
   - Updated `MemtableRange` instantiation to include `num_rows`.

 - **`range.rs`**:
   - Refactored `MemRangeBuilder` to handle a single `MemtableRange` and `MemtableStats`.

 - **`scan_region.rs`**:
   - Enhanced memtable filtering based on time range and updated `MemRangeBuilder` usage.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 **Enhancements and Bug Fixes**

 - **Deduplication Enhancements**:
   - Introduced `DedupReader` and `LastRow` as public structs in `dedup.rs` to enhance deduplication capabilities.
   - Added `LastNonNull` deduplication strategy in `flush.rs` and `simple_bulk_memtable.rs`.

 - **Memtable Improvements**:
   - Updated `SimpleBulkMemtable` to support batch size configuration and deduplication strategies.
   - Modified `Series` struct in `time_series.rs` to include a configurable capacity.

 - **Testing Enhancements**:
   - Added new test `test_write_dedup` in `simple_bulk_memtable.rs` to verify deduplication functionality.
   - Updated existing tests to include `OpType` parameter for better operation type handling.

 - **Refactoring**:
   - Renamed `BatchIterBuilder` to `BatchRangeBuilder` in `simple_bulk_memtable.rs` for clarity.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* bulk-multiparts-merge-reader:
 - **Refactor `flush.rs`:** Removed `LastNonNullIter` usage and adjusted `DedupReader` instantiation to use `LastRow::new(false)` and `LastNonNull::new(false)`.
 - **Enhance `simple_bulk_memtable.rs`:** Added logic to handle `LastNonNull` merge mode in `IterBuilder`. Introduced new tests: `test_delete_only` and `test_single_range` to verify delete operations and single range handling.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix: tests

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-07-25 03:05:54 +00:00
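Since the commit above replaces `Memtable::iter` with `Memtable::ranges`, a schematic Rust sketch of that interface shift may help; every type and signature below is a placeholder invented for illustration, not GreptimeDB's actual `mito2` code.

```rust
use std::collections::BTreeMap;

/// Placeholder for a batch of rows read from a memtable.
pub struct Batch;
pub type BoxedBatchIterator = Box<dyn Iterator<Item = Batch> + Send>;

/// Placeholder for one independently readable range of a memtable.
pub struct MemtableRange;

impl MemtableRange {
    /// Build an iterator over this single range (empty here for illustration).
    pub fn build_iter(&self) -> BoxedBatchIterator {
        Box::new(std::iter::empty())
    }
}

/// Placeholder for the set of ranges a memtable exposes.
pub struct MemtableRanges {
    pub ranges: BTreeMap<usize, MemtableRange>,
}

pub trait Memtable {
    /// Old shape: a single iterator over the whole memtable
    /// (kept behind a test-only cfg per the commit above).
    fn iter(&self) -> BoxedBatchIterator;

    /// New shape: expose ranges so flush and scan paths can merge
    /// several sources (e.g. via a merge reader) instead of one iterator.
    fn ranges(&self) -> MemtableRanges;
}
```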
Zhenchi
dc01511adc refactor: explicitly accept path type as param (#6583)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-07-24 09:56:41 +00:00
Lanqing Yang
b01be5efc5 feat: move metasrv admin to http server while keeping tonic for backward compatibility (#6466)
* feat: move metasrv admin to http server while keeping tonic for backward compatibility

Signed-off-by: lyang24 <lanqingy93@gmail.com>

* refactor with nest method

Signed-off-by: lyang24 <lanqingy93@gmail.com>

---------

Signed-off-by: lyang24 <lanqingy93@gmail.com>
Co-authored-by: lyang24 <lanqingy@usc.edu>
2025-07-24 09:11:04 +00:00
Zhenchi
5908febd6c refactor: remove unused PartitionDef (#6573)
* refactor: remove unused PartitionDef

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix snafu

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-07-24 07:55:51 +00:00
Ruihang Xia
076e20081b feat: add RegionId to FileId (#6410)
* refactor: remove stale manifest structures

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add RegionId to FileId

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename method

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix test cases

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refactor: introduce RegionFileId

- FileId still only consist of an uuid
- PathProvider accepts RegionFileId and doesn't need to keep a region id
  in it
- All Index applier takes RegionFileId and respects the region id in the RegionFileId
- FileMeta can still derive Serialize/Deserialize
- Refactor the CacheManager to accept RegionFileId

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: define PathType

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: adding PathType WIP

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: fix compiler errors

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: add path_type to region_dir_from_table_dir

Move region_dir_from_table_dir to mito and use join_dir internally

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: set path type to ApplierBuilder

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fmt code

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: fix passing incorrect dir to access layer

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove region_dir from CompactionRegion

We can get table_dir and path_type from the access layer

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: fix unit tests

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix typo

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: update comment

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: correct marker path

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: use AccessLayer::build_region_dir to get region dir

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: log entries in test

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: set path type in catchup

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: fix test_open_region_failure test

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix compiler errors

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-07-24 06:29:38 +00:00
yihong
61d83bc36b fix: close issue #6555 return empty result (#6569)
* fix: close issue #6555 return empty result

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: only start one instance per regex sqlness test (#6570)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* refactor: refactor partition mod to use PartitionExpr instead of PartitionDef (#6554)

* refactor: refactor partition mod to use PartitionExpr instead of PartitionDef

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix snafu

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* Puts expression into PbPartition

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix compile

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* update proto

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* add serde test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* add serde test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>
2025-07-24 03:30:59 +00:00
discord9
8d71213ea6 fix: aggr group by all partition cols use partial commutative (#6534)
* fix: aggr group by all partition cols use partial commutative

Signed-off-by: discord9 <discord9@163.com>

* test: bugged case

Signed-off-by: discord9 <discord9@163.com>

* test: sqlness fix

Signed-off-by: discord9 <discord9@163.com>

* test: more redacted

Signed-off-by: discord9 <discord9@163.com>

* more cases

Signed-off-by: discord9 <discord9@163.com>

* even more test cases

Signed-off-by: discord9 <discord9@163.com>

* join testcase

Signed-off-by: discord9 <discord9@163.com>

* fix: column requirement added in correct location

Signed-off-by: discord9 <discord9@163.com>

* fix test

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* track col reqs per stack

Signed-off-by: discord9 <discord9@163.com>

* fix: continue

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* refactor: test mod

Signed-off-by: discord9 <discord9@163.com>

* test utils

Signed-off-by: discord9 <discord9@163.com>

* test: better test

Signed-off-by: discord9 <discord9@163.com>

* more testcases

Signed-off-by: discord9 <discord9@163.com>

* test limit push down

Signed-off-by: discord9 <discord9@163.com>

* more testcases

Signed-off-by: discord9 <discord9@163.com>

* more testcase

Signed-off-by: discord9 <discord9@163.com>

* more test

Signed-off-by: discord9 <discord9@163.com>

* chore: update sqlness

Signed-off-by: discord9 <discord9@163.com>

* chore: update comments

Signed-off-by: discord9 <discord9@163.com>

* fix: check col reqs from bottom to upper

Signed-off-by: discord9 <discord9@163.com>

* chore: more comment

Signed-off-by: discord9 <discord9@163.com>

* docs: more todo

Signed-off-by: discord9 <discord9@163.com>

* chore: comments

Signed-off-by: discord9 <discord9@163.com>

* test: a new failing test that should be fixed

Signed-off-by: discord9 <discord9@163.com>

* fix: part col alias tracking

Signed-off-by: discord9 <discord9@163.com>

* chore: unused

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* docs: comment

Signed-off-by: discord9 <discord9@163.com>

* more testcases

Signed-off-by: discord9 <discord9@163.com>

* more testcase for step/part aggr combine

Signed-off-by: discord9 <discord9@163.com>

* FIXME: a new bug

Signed-off-by: discord9 <discord9@163.com>

* literally unfixable

Signed-off-by: discord9 <discord9@163.com>

* chore: remove some debug print

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-23 09:05:34 +00:00
Zhenchi
2298227e0c refactor: refactor partition mod to use PartitionExpr instead of PartitionDef (#6554)
* refactor: refactor partition mod to use PartitionExpr instead of PartitionDef

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix snafu

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* Puts expression into PbPartition

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix compile

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* update proto

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* add serde test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* add serde test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-07-23 03:51:28 +00:00
yihong
b51bafe9d5 fix: only start one instance per regex sqlness test (#6570)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-07-23 03:14:06 +00:00
ZonaHe
6e324adf02 feat: update dashboard to v0.10.4 (#6568)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-07-22 15:38:47 +00:00
ZonaHe
02a9edef8b feat: update dashboard to v0.10.3 (#6566)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-07-22 06:05:03 +00:00
Weny Xu
71b564b4aa refactor: support multiple index operations in single alter region request (#6487)
* refactor: support multiple index operations in single alter region request

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update greptime-proto

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-22 03:10:42 +00:00
Ning Sun
2d67f9ae8b fix: windows path in error tests (#6564) 2025-07-21 09:07:18 +00:00
discord9
6501b0b13c feat: MergeScan print input (#6563)
* feat: MergeScan print input

Signed-off-by: discord9 <discord9@163.com>

* test: fix ut

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-21 06:40:15 +00:00
zyy17
dfe9eeb15c fix: unit test failure by adding necessary sleep to ensure the time sequence (#6548)
Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-07-21 03:47:43 +00:00
dennis zhuang
78b1c6c554 feat: impl timestamp function for promql (#6556)
* feat: impl timestamp function for promql

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: style and typo

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: test

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* docs: update comments

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: comment

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-07-21 03:46:41 +00:00
yihong
1c8e8b96c1 docs: fix all dead links using lychee (#6559)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-07-19 14:04:06 +00:00
discord9
73a3ac1320 fix: flow mirror cache (#6551)
* fix: invalid cache when flownode changes address

Signed-off-by: discord9 <discord9@163.com>

* update comments

Signed-off-by: discord9 <discord9@163.com>

* fix

Signed-off-by: discord9 <discord9@163.com>

* refactor: add log&rename

Signed-off-by: discord9 <discord9@163.com>

* stuff

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-18 12:21:01 +00:00
Yan Tingwang
efefddbc85 test: add sqlness test for max execution time (#6517)
* add sqlness test for max_execution_time

Signed-off-by: codephage. <tingwangyan2020@163.com>

* add Pre-line comments SQLNESS PROTOCOL MYSQL

Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix(mysql): support max_execution_time variable

Co-authored-by: evenyag <realevenyag@gmail.com>
Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix: test::test_check & sqlness test mysql

Signed-off-by: codephage. <tingwangyan2020@163.com>

* add sqlness test for max_execution_time

Signed-off-by: codephage. <tingwangyan2020@163.com>

* add Pre-line comments SQLNESS PROTOCOL MYSQL

Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix(mysql): support max_execution_time variable

Co-authored-by: evenyag <realevenyag@gmail.com>
Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix: test::test_check & sqlness test mysql

Signed-off-by: codephage. <tingwangyan2020@163.com>

* chore: Unify the sql style

Signed-off-by: codephage. <tingwangyan2020@163.com>

---------

Signed-off-by: codephage. <tingwangyan2020@163.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
2025-07-18 12:18:14 +00:00
localhost
58d2b7764f chore: Add explicit channels to Grpc and Prometheus query contexts (#6552) 2025-07-18 08:37:48 +00:00
fys
0dad8fc4a8 fix: estimate mem size for bulk ingester (#6550) 2025-07-18 08:03:07 +00:00
LFC
370d27587a refactor: make greptimedb's tests ran as a submodule (#6544)
fix: failed to run a test when as a submodule

Signed-off-by: luofucong <luofc@foxmail.com>
2025-07-18 08:01:30 +00:00
discord9
77b540ff68 feat: state/merge wrapper for aggr func (#6377)
* refactor: move to query crate

Signed-off-by: discord9 <discord9@163.com>

* refactor: split to multiple columns

Signed-off-by: discord9 <discord9@163.com>

* feat: aggr merge accum wrapper

Signed-off-by: discord9 <discord9@163.com>

* rename shorter

Signed-off-by: discord9 <discord9@163.com>

* feat: add all in one helper

Signed-off-by: discord9 <discord9@163.com>

* tests: sum&avg

Signed-off-by: discord9 <discord9@163.com>

* chore: allow unused

Signed-off-by: discord9 <discord9@163.com>

* chore: typos

Signed-off-by: discord9 <discord9@163.com>

* refactor: per ds

Signed-off-by: discord9 <discord9@163.com>

* chore: fix tests

Signed-off-by: discord9 <discord9@163.com>

* refactor: move to common-function

Signed-off-by: discord9 <discord9@163.com>

* WIP massive refactor

Signed-off-by: discord9 <discord9@163.com>

* typo

Signed-off-by: discord9 <discord9@163.com>

* todo: stuff

Signed-off-by: discord9 <discord9@163.com>

* refactor: state2input type

Signed-off-by: discord9 <discord9@163.com>

* chore: rm unused

Signed-off-by: discord9 <discord9@163.com>

* refactor: per bot review

Signed-off-by: discord9 <discord9@163.com>

* chore: per bot

Signed-off-by: discord9 <discord9@163.com>

* refactor: rm duplicate infer type

Signed-off-by: discord9 <discord9@163.com>

* chore: better test

Signed-off-by: discord9 <discord9@163.com>

* fix: test sum refactor&fix wrong state types

Signed-off-by: discord9 <discord9@163.com>

* test: refactor avg udaf test

Signed-off-by: discord9 <discord9@163.com>

* refactor: split files

Signed-off-by: discord9 <discord9@163.com>

* refactor: docs&dedup

Signed-off-by: discord9 <discord9@163.com>

* refactor: allow merge to carry extra info

Signed-off-by: discord9 <discord9@163.com>

* chore: rm unused

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* chore: docs&unused

Signed-off-by: discord9 <discord9@163.com>

* refactor: check fields equal

Signed-off-by: discord9 <discord9@163.com>

* test: test count_hash

Signed-off-by: discord9 <discord9@163.com>

* test: more custom udaf

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* refactor: per review

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-17 17:37:40 +00:00
Lei, HUANG
0cf25f7b05 feat: add sst file num in region stat (#6537)
* feat/add-sst-file-num-in-region-stat:
 ### Add SST File Count to Region Statistics

 - **Enhancements**:
   - Added `sst_num` to track the number of SST files in region statistics across multiple modules.
   - Updated `RegionStat` and `RegionStatistic` structs in `datanode.rs` and `region_engine.rs` to include `sst_num`.
   - Modified `MitoRegion` and `SstVersion` in `region.rs` and `version.rs` to compute and return the number of SST files.
   - Adjusted test cases in `collect_leader_region_handler.rs`, `failure_handler.rs`, `region_lease_handler.rs`, and `weight_compute.rs` to initialize `sst_num`.
   - Updated `get_region_statistic` in `utils.rs` to sum `sst_num` from metadata and data statistics.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/add-sst-file-num-in-region-stat:
 Add `sst_num` to `region_statistics`

 - Updated `region_statistics.rs` to include a new constant `SST_NUM` and added it to the schema and builder structures.
 - Modified `information_schema.result` to reflect the addition of `sst_num` in the `region_statistics` table.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-07-17 17:36:20 +00:00
Yingwen
c23b26461c feat: add metrics for request wait time and adjust stall metrics (#6540)
* feat: add metric greptime_mito_request_wait_time to observe wait time

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add worker to wait time metric

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: rename stall gauge to greptime_mito_write_stalling_count

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: change greptime_mito_write_stall_total to total stalled requests

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: merge lazy static blocks

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-17 17:17:51 +00:00
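The metric names in the commit above are concrete, so a brief sketch of how such metrics are typically registered may help; this assumes the `prometheus` and `lazy_static` crates with an invented label name, and is not the actual `mito2` metrics code.

```rust
use lazy_static::lazy_static;
use prometheus::{register_histogram_vec, register_int_gauge, HistogramVec, IntGauge};

lazy_static! {
    /// Time a write request spends waiting before a region worker handles it,
    /// labeled per worker (the "worker" label is an assumption).
    pub static ref REQUEST_WAIT_TIME: HistogramVec = register_histogram_vec!(
        "greptime_mito_request_wait_time",
        "mito write request wait time",
        &["worker"]
    )
    .unwrap();

    /// Number of write requests currently stalled (the renamed stall gauge).
    pub static ref WRITE_STALLING_COUNT: IntGauge = register_int_gauge!(
        "greptime_mito_write_stalling_count",
        "mito stalled write requests"
    )
    .unwrap();
}
```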
zyy17
90ababca97 fix: typo for existed -> exited (#6547)
chore: `existed` -> `exited`

Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-07-17 12:27:29 +00:00
dennis zhuang
37dc057423 feat: adds uptime telemetry (#6545)
* feat: adds uptime telemetry

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: remove seconds and minutes

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-07-17 09:55:28 +00:00
Weny Xu
8237646055 feat: add table reconciliation utilities (#6519)
* feat: add table reconciliation utilities

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update comment

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-17 08:05:38 +00:00
Lei, HUANG
011dd51495 chore: update opendal dashboard (#6541)
* chore/update-opendal-dashboard:
 ### Update Grafana Dashboard Queries

 - **Enhanced Metrics Queries**: Updated Prometheus queries in `dashboard.json`, `dashboard.md`, and `dashboard.yaml` files for both `cluster` and `standalone` dashboards to include additional operations (`Reader::read`, `Writer::write`, `Writer::close`) in the metrics calculations.
 - **Legend Format Adjustments**: Modified legend formats to include the `operation` field for better clarity in visualizations.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* chore/update-opendal-dashboard:
 Enhance Legend Format in Grafana Dashboards

 - Updated the `legendFormat` in `dashboard.json`, `dashboard.md`, and `dashboard.yaml` files for both `cluster` and `standalone` dashboards to include the `operation` field.
 - This change affects the following files:
   - `grafana/dashboards/metrics/cluster/dashboard.json`
   - `grafana/dashboards/metrics/cluster/dashboard.md`
   - `grafana/dashboards/metrics/cluster/dashboard.yaml`
   - `grafana/dashboards/metrics/standalone/dashboard.json`
   - `grafana/dashboards/metrics/standalone/dashboard.md`
   - `grafana/dashboards/metrics/standalone/dashboard.yaml`

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-07-17 07:17:46 +00:00
Zhenchi
eb99e439c7 feat: MatchesConstTerm displays probes (#6518)
* feat: `MatchesConstTerm` displays probes

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix fmt

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-07-17 07:01:55 +00:00
Zhenchi
50148b25b5 fix: row selection intersection removes trailing rows (#6539)
* fix: row selection intersection removes trailing rows

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix typos

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-07-16 17:00:10 +00:00
Ruihang Xia
639b3ddc3e feat: update partial execution metrics (#6499)
* feat: update partial execution metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* send data with metrics in distributed mode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* only send partial metrics under VERBOSE flag

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* loop to while

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-07-16 16:59:10 +00:00
discord9
139a36a459 fix: breaking loop when not retryable (#6538)
fix: breaking when not retryable

Signed-off-by: discord9 <discord9@163.com>
2025-07-16 09:22:41 +00:00
LFC
e6b9d09901 feat: Flight supports RecordBatch with dictionary arrays (#6521)
* feat: Flight supports RecordBatch with dictionary arrays

Signed-off-by: luofucong <luofc@foxmail.com>

* resolve PR comments

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
2025-07-16 09:21:12 +00:00
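To make concrete what "RecordBatch with dictionary arrays" means in the commit above, here is a small self-contained sketch using the arrow crate; the column names and values are invented, and this is not the GreptimeDB Flight encoding code.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, DictionaryArray, Int64Array};
use arrow::datatypes::{DataType, Field, Int32Type, Schema};
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), arrow::error::ArrowError> {
    // A low-cardinality string column stored as a dictionary: values are kept
    // once in a dictionary and each row holds a small integer key into it.
    let tags: DictionaryArray<Int32Type> =
        vec!["web", "db", "web", "web"].into_iter().collect();
    let values = Int64Array::from(vec![1, 2, 3, 4]);

    let schema = Arc::new(Schema::new(vec![
        Field::new(
            "tag",
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            false,
        ),
        Field::new("value", DataType::Int64, false),
    ]));

    // A batch like this is what #6521 lets the Flight encoding carry.
    let columns = vec![Arc::new(tags) as ArrayRef, Arc::new(values) as ArrayRef];
    let batch = RecordBatch::try_new(schema, columns)?;
    assert_eq!(batch.num_rows(), 4);
    Ok(())
}
```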
discord9
bbc9f3ea1e refactor: expose flow batching mode constants to config (#6442)
* refactor: make flow batching mode constants configurable

Signed-off-by: discord9 <discord9@163.com>

* docs: config docs

Signed-off-by: discord9 <discord9@163.com>

* docs: update code comment

Signed-off-by: discord9 <discord9@163.com>

* test: fix test_config_api

Signed-off-by: discord9 <discord9@163.com>

* feat: more batch opts

Signed-off-by: discord9 <discord9@163.com>

* fix after rebase

Signed-off-by: discord9 <discord9@163.com>

* chore: per review

Signed-off-by: discord9 <discord9@163.com>

* per review experimental options

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-16 08:05:20 +00:00
zyy17
fac8c3e62c feat: introduce common event recorder (#6501)
* feat: introduce common event recorder

Signed-off-by: zyy17 <zyylsxm@gmail.com>

* refactor: use `backon` as retry lib

Signed-off-by: zyy17 <zyylsxm@gmail.com>

* chore: remove unused error

Signed-off-by: zyy17 <zyylsxm@gmail.com>

* refactor: add CancellationToken

Signed-off-by: zyy17 <zyylsxm@gmail.com>

* refactor: add `cancel()`

Signed-off-by: zyy17 <zyylsxm@gmail.com>

* refactor: `cancel()` -> `close()`
chore: `cancel()` -> `close()` and polish some code

Signed-off-by: zyy17 <zyylsxm@gmail.com>

---------

Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-07-16 03:59:41 +00:00
dennis zhuang
411eb768b1 feat: supports null response format for http API (#6531)
* feat: supports null response format for http API

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: license header and assertion

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: in seconds

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-07-16 03:46:39 +00:00
shuiyisong
95d2549007 chore: minor update for using pipeline with prometheus (#6522)
Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-16 03:03:07 +00:00
Lei, HUANG
6744f5470b fix(grpc): check grpc client unavailable (#6488)
* fix/check-grpc-client-unavailable:
 Improve async handling in `greptime_handler.rs`

 - Updated the `DoPut` response handling to use `await` with `result_sender.send` for better asynchronous operation.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/check-grpc-client-unavailable:
 ### Improve Error Handling in `greptime_handler.rs`

 - Enhanced error handling for the `DoPut` operation by switching from `send` to `try_send` for the `result_sender`.
 - Added specific logging for unreachable clients, including `request_id` in the warning message.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-07-15 14:32:45 +00:00
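Since the commit above describes switching the DoPut response path from `send` to `try_send`, here is a minimal sketch of that pattern with tokio's bounded mpsc channel; the names (`result_sender`, `request_id`) echo the commit text, but the function itself is invented for illustration.

```rust
use tokio::sync::mpsc::{self, error::TrySendError};

/// Sketch only: forward a response without blocking the handler. If the
/// channel is full or closed, the client is treated as unreachable.
fn forward_response(result_sender: &mpsc::Sender<String>, response: String, request_id: u64) {
    match result_sender.try_send(response) {
        Ok(()) => {}
        Err(TrySendError::Full(_)) | Err(TrySendError::Closed(_)) => {
            // Log and drop instead of awaiting a client that is not consuming.
            eprintln!("client unreachable, dropping DoPut response, request_id: {request_id}");
        }
    }
}
```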
localhost
9d05c7c5fa chore: subdivision of different channels (#6526)
* chore: subdivision of different channels

* chore: add as_str for channel

* chore: fix by pr comment

* chore: fix by pr comment
2025-07-15 09:22:01 +00:00
localhost
9442284fd4 chore: add db label for greptime_table_operator_ingest_rows (#6520)
* chore: add db label for greptime_table_operator_ingest_rows

* chore: add db label for greptime_table_operator_delete_rows
2025-07-15 07:16:44 +00:00
Weny Xu
1065db9518 fix: fix state transition in create table procedure (#6523)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-15 06:06:27 +00:00
Lei, HUANG
2f9a10ab74 refactor: expose bulk symbols (#6467)
* wip

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/expose-bulk-symbols:
 ### Commit Message

 Enhance DDL Module Accessibility and Refactor `verify_alter` Function

 - **`statement.rs`**: Made the `ddl` module public to enhance accessibility.
 - **`ddl.rs`**:
   - Made `NAME_PATTERN_REG` public for broader usage.
   - Refactored `verify_alter` function to be a standalone public function, improving modularity and reusability.
   - Made `parse_partitions` function public to allow external access.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/expose-bulk-symbols:
 ### Add Parquet Writer and Enhance Row Modifier

 - **Add Parquet Writer Module**: Introduced a new module `parquet_writer.rs` to bridge `opendal` `Writer` with `parquet` `AsyncFileWriter`.
 - **Enhance Row Modifier**: Updated `RowModifier` to use `Default` trait and made `fill_internal_columns` a public static method in `row_modifier.rs`.
 - **Expose Internal Structures**: Made `RowsIter`, `RowIter`, `TablesBuilder`, and `TableBuilder` structs public in `row_modifier.rs` and `prom_row_builder.rs`.
 - **Update Metric Engine**: Changed `RowModifier` instantiation to use `default()` in `engine.rs`.
 - **Modify Table Options Handling**: Added `fill_table_options_for_create` function in `insert.rs` to handle table options based on `AutoCreateTableType`.
 - **Make Constants Public**: Changed `DEFAULT_ROW_GROUP_SIZE` to public in `parquet.rs`.
 - **Expose Functions**: Made `extract_add_columns_expr` public in `expr_helper.rs` and `AutoCreateTableType` public in `insert.rs`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/expose-bulk-symbols:
 ### Commit Message

 Enhance HTTP Server and Prometheus Integration

 - **`http.rs`**: Made `extractor` module public to allow external access.
 - **`prom_store.rs`**: Refactored `decode_remote_write_request` to return `TablesBuilder` and adjusted logic for processing requests based on pipeline usage.
 - **`lib.rs`**: Made `metrics` module public for broader accessibility.
 - **`prom_row_builder.rs`**: Exposed `tables` field in `TablesBuilder` for external manipulation.
 - **`proto.rs`**: Changed visibility of `table_data` in `PromWriteRequest` to `pub(crate)` for internal module access.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/expose-bulk-symbols:
 ### Add Accessor Methods for Managers and Executors

 - **`src/frontend/src/instance.rs`**: Added accessor methods for `NodeManagerRef`, `PartitionRuleManagerRef`, `CacheInvalidatorRef`, and `ProcedureExecutorRef` to the `Instance` struct.
 - **`src/operator/src/insert.rs`**: Introduced methods to access `NodeManagerRef` and `PartitionRuleManagerRef` in the `Inserter` struct.
 - **`src/operator/src/statement.rs`**: Added methods to retrieve `ProcedureExecutorRef` and `CacheInvalidatorRef` in the `StatementExecutor` struct.

 ### Change HashMap Implementation

 - **`src/servers/src/prom_row_builder.rs`**: Replaced `ahash::HashMap` with `std::collections::HashMap`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/expose-bulk-symbols:
 Refactor table option handling in `insert.rs`

 - Replaced `Vec` with `HashMap` for `table_options` to improve efficiency.
 - Extracted logic for filling table options into a new function `fill_table_options_for_create`.
 - Modified `fill_table_options_for_create` to return the engine name based on `create_type`.
 - Simplified the insertion of table options into `create_table_expr` by using `extend` method.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/expose-bulk-symbols:
 Refactor `insert.rs` to separate engine name logic from table options

 - Updated `Inserter` implementation to determine `engine_name` separately from `fill_table_options_for_create`.
 - Modified `fill_table_options_for_create` to no longer return an engine name, focusing solely on populating table options.
 - Adjusted logic to set `engine_name` based on `AutoCreateTableType`, using `METRIC_ENGINE_NAME` for logical tables and `default_engine()` otherwise.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-07-14 07:32:01 +00:00
Ruihang Xia
d82bc98717 feat(parser): parse TQL in CTE position (#6456)
* naive implementation

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refactor to use existing tql parse logic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refactor display logic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refactor column list parsing logic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refactor to remove redundant check logic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* set sql cte into Query

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-07-14 06:44:56 +00:00
shuiyisong
582bcc3b14 feat(pipeline): filter processor (#6502)
* feat: add filter processor

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* test: add tests

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: change target to list and use `in` and `not_in`

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: rebase main and fix error

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-13 23:18:42 +00:00
zyy17
e5e10fd362 refactor: add the active_frontends() in PeerLookupService (#6504)
Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-07-11 09:42:00 +00:00
zyy17
104d607b3f refactor: support to specify ttl in open_compaction_region() (#6515)
refactor: add `ttl` in `open_compaction_region()` and `CompactionJob`

Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-07-11 09:00:05 +00:00
zyy17
93e3a04aa8 refactor: add row_inserts() and row_inserts_with_hints(). (#6503)
Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-07-11 08:06:36 +00:00
Weny Xu
c1847e6b6a chore: change log level for region not found during lease renewal (#6513)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-11 07:51:29 +00:00
discord9
d258739c26 refactor(flow): faster time window expr (#6495)
* refactor: faster window expr

Signed-off-by: discord9 <discord9@163.com>

* docs: explain fast path

Signed-off-by: discord9 <discord9@163.com>

* chore: rm unwrap

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-07-11 06:43:14 +00:00
Yan Tingwang
914086668d fix: add system variable max_execution_time (#6511)
add system variable : max_execution_time

Signed-off-by: codephage. <tingwangyan2020@163.com>
2025-07-11 02:11:21 +00:00
localhost
01a8ad1304 chore: add prom store metrics (#6508)
chore: add metrics for db
2025-07-10 17:09:58 +00:00
shuiyisong
1594859957 refactor: replace pipeline::value with vrl::value (#6430)
* chore: pass compile

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: default case

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: remove and move code

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: remove serde_value to vrlvalue conversion

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* refactor: optimized vrl value related code

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* refactor: loki transform using vrl

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: remove unused error

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: fix cr issue

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: use from_utf8_lossy_owned

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: CR issue

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-10 17:08:31 +00:00
Ruihang Xia
351a77a2e5 fix: expand on conditional commutative as well (#6484)
* fix: expand on conditional commutative as well

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: discord9 <discord9@163.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: discord9 <discord9@163.com>

* add logging to figure test failure

Signed-off-by: discord9 <discord9@163.com>

* revert

Signed-off-by: discord9 <discord9@163.com>

* feat: stream drop record metrics

Signed-off-by: discord9 <discord9@163.com>

* Revert "feat: stream drop record metrics"

This reverts commit 6a16946a5b8ea37557bbb1b600847d24274d6500.

Signed-off-by: discord9 <discord9@163.com>

* feat: stream drop record metrics

Signed-off-by: discord9 <discord9@163.com>

refactor: move logging to drop too

Signed-off-by: discord9 <discord9@163.com>

fix: drop input stream before collect metrics

Signed-off-by: discord9 <discord9@163.com>

* fix: expand differently

Signed-off-by: discord9 <discord9@163.com>

* test: update sqlness

Signed-off-by: discord9 <discord9@163.com>

* chore: more dbg

Signed-off-by: discord9 <discord9@163.com>

* Revert "feat: stream drop record metrics"

This reverts commit 3eda4a2257928d95cf9c1328ae44fae84cfbb017.

Signed-off-by: discord9 <discord9@163.com>

* test: sqlness redacted

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: discord9 <discord9@163.com>
Co-authored-by: discord9 <discord9@163.com>
2025-07-10 15:13:52 +00:00
shuiyisong
7723cba7da chore: skip calc ts in doc 2 with transform (#6509)
Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-10 13:10:02 +00:00
localhost
dd7da3d2c2 chore: remove region id to reduce time series (#6506) 2025-07-10 12:33:06 +00:00
Weny Xu
ffe0da0405 fix: correctly update partition key indices during alter table operations (#6494)
* fix: correctly update partition key indices in alter table operations

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add sqlness tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-10 08:08:07 +00:00
Ning Sun
f2c7b09825 fix: add tracing dependencies (#6497) 2025-07-10 03:01:31 +00:00
Yingwen
3583b3204f feat: override batch sequence by the sequence in FileMeta (#6492)
* feat: support overriding read sequence by sequence in the FileMeta

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add test for parquet reader

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: update need_override_sequence to check all row groups

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-10 02:26:25 +00:00
Copilot
fea8bc5ee7 chore(comments): fix typo and grammar issues (#6496)
* Initial plan

* Fix 5 TODO comments: spelling typos and formatting issues

Co-authored-by: waynexia <15380403+waynexia@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: waynexia <15380403+waynexia@users.noreply.github.com>
2025-07-10 02:24:42 +00:00
Yingwen
40363bfc0f fix: range query returns range selector error when table not found (#6481)
* test: add sqlness test for range vector with non-existence metric

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: handle empty metric for matrix selector

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: update sqlness result

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: add newline

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-10 01:53:23 +00:00
jeremyhi
85c0136619 fix: greptime timestamp display null (#6469)
* feat: is_overflow method

* feat: check ts overflow
2025-07-10 01:53:00 +00:00
dennis zhuang
b70d998596 feat: improve install script (#6490)
Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-07-09 17:04:20 +00:00
LFC
2f765c8fd4 refactor: remove unnecessary args (#6493)
* x

Signed-off-by: luofucong <luofc@foxmail.com>

* refactor: remove unnecessary args

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
2025-07-09 13:23:15 +00:00
shuiyisong
d99cd98c01 fix: skip nan in prom remote write pipeline (#6489)
Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-09 11:46:07 +00:00
Weny Xu
a858f55257 refactor(meta): separate validation and execution logic in alter logical tables procedure (#6478)
* refactor(meta): separate validation and execution logic in alter logical tables procedure

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-09 06:48:27 +00:00
Ning Sun
916967ea59 feat: allow alternative version string (#6472)
* feat: allow alternative version string

* refactor: rename original version function to verbose_version

Signed-off-by: Ning Sun <sunning@greptime.com>

---------

Signed-off-by: Ning Sun <sunning@greptime.com>
2025-07-09 06:43:01 +00:00
Weny Xu
c58d8aa94a refactor(meta): extract AlterTableExecutor from AlterTableProcedure (#6470)
* refactor(meta): extract `AlterTableExecutor` from `AlterTableProcedure`

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-07-09 05:13:19 +00:00
Ning Sun
eeb061ca74 feat: allow float number literal in step (#6483)
* chore: allow float number literal as step

Signed-off-by: Ning Sun <sunning@greptime.com>

* chore: switch to released version of promql parser

Signed-off-by: Ning Sun <sunning@greptime.com>

---------

Signed-off-by: Ning Sun <sunning@greptime.com>
2025-07-09 03:09:09 +00:00
shuiyisong
f7282fde28 chore: sort range query return values (#6474)
* chore: sort range query return values

* chore: add comments

* chore: add is_sorted check

* fix: test
2025-07-09 02:27:12 +00:00
dennis zhuang
a4bd11fb9c fix: empty statements hang (#6480)
* fix: empty statements hang

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* tests: add cases

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-07-09 02:13:14 +00:00
LFC
6dc9e8ddb4 feat: display extension ranges in "explain" (#6475)
* feat: display extension ranges in "explain"

Signed-off-by: luofucong <luofc@foxmail.com>

* fix ci

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
2025-07-09 02:11:23 +00:00
588 changed files with 37646 additions and 12794 deletions

View File

@@ -12,3 +12,6 @@ fetch = true
checkout = true
list_files = true
internal_use_git2 = false
[env]
CARGO_WORKSPACE_DIR = { value = "", relative = true }

5
.gitignore vendored
View File

@@ -60,4 +60,7 @@ tests-fuzz/corpus/
greptimedb_data
# github
!/.github
!/.github
# Claude code
CLAUDE.md

View File

@@ -10,12 +10,10 @@
* [NiwakaDev](https://github.com/NiwakaDev)
* [tisonkun](https://github.com/tisonkun)
## Team Members (in alphabetical order)
* [apdong2022](https://github.com/apdong2022)
* [beryl678](https://github.com/beryl678)
* [Breeze-P](https://github.com/Breeze-P)
* [daviderli614](https://github.com/daviderli614)
* [discord9](https://github.com/discord9)
* [evenyag](https://github.com/evenyag)

424
Cargo.lock generated

File diff suppressed because it is too large

View File

@@ -13,6 +13,7 @@ members = [
"src/common/datasource",
"src/common/decimal",
"src/common/error",
"src/common/event-recorder",
"src/common/frontend",
"src/common/function",
"src/common/greptimedb-telemetry",
@@ -139,7 +140,7 @@ etcd-client = "0.14"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "ceb1af4fa9309ce65bda0367db7b384df2bb4d4f" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "fe8c13f5f3c1fbef63f57fbdd29f0490dfeb987b" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -167,14 +168,13 @@ opentelemetry-proto = { version = "0.27", features = [
"with-serde",
"logs",
] }
ordered-float = { version = "4.3", features = ["serde"] }
parking_lot = "0.12"
parquet = { version = "54.2", default-features = false, features = ["arrow", "async", "object_store"] }
paste = "1.0"
pin-project = "1.0"
prometheus = { version = "0.13.3", features = ["process"] }
promql-parser = { git = "https://github.com/GreptimeTeam/promql-parser.git", rev = "0410e8b459dda7cb222ce9596f8bf3971bd07bd2", features = [
"ser",
] }
promql-parser = { version = "0.6", features = ["ser"] }
prost = { version = "0.13", features = ["no-recursion-limit"] }
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.9"
@@ -225,10 +225,13 @@ tokio-util = { version = "0.7", features = ["io-util", "compat"] }
toml = "0.8.8"
tonic = { version = "0.12", features = ["tls", "gzip", "zstd"] }
tower = "0.5"
tower-http = "0.6"
tracing = "0.1"
tracing-appender = "0.2"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] }
typetag = "0.2"
uuid = { version = "1.7", features = ["serde", "v4", "fast-rng"] }
vrl = "0.25"
zstd = "0.13"
# DO_NOT_REMOVE_THIS: END_OF_EXTERNAL_DEPENDENCIES
@@ -246,6 +249,7 @@ common-config = { path = "src/common/config" }
common-datasource = { path = "src/common/datasource" }
common-decimal = { path = "src/common/decimal" }
common-error = { path = "src/common/error" }
common-event-recorder = { path = "src/common/event-recorder" }
common-frontend = { path = "src/common/frontend" }
common-function = { path = "src/common/function" }
common-greptimedb-telemetry = { path = "src/common/greptimedb-telemetry" }

View File

@@ -207,6 +207,8 @@
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |
## Distributed Mode
@@ -281,6 +283,7 @@
| `meta_client.metadata_cache_tti` | String | `5m` | -- |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `query.allow_query_fallback` | Bool | `false` | Whether to allow query fallback when push-down optimization fails.<br/>Defaults to false, meaning an error message is returned when push-down optimization fails |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.connect_timeout` | String | `10s` | -- |
@@ -310,6 +313,8 @@
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |
### Metasrv
@@ -332,6 +337,12 @@
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
| `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
| `backend_tls` | -- | -- | TLS configuration for kv store backend (only applicable for PostgreSQL/MySQL backends)<br/>When using PostgreSQL or MySQL as metadata store, you can configure TLS here |
| `backend_tls.mode` | String | `prefer` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- "disable" - No TLS<br/>- "prefer" (default) - Try TLS, fallback to plain<br/>- "require" - Require TLS<br/>- "verify_ca" - Require TLS and verify CA<br/>- "verify_full" - Require TLS and verify hostname |
| `backend_tls.cert_path` | String | `""` | Path to client certificate file (for client authentication)<br/>Like "/path/to/client.crt" |
| `backend_tls.key_path` | String | `""` | Path to client private key file (for client authentication)<br/>Like "/path/to/client.key" |
| `backend_tls.ca_cert_path` | String | `""` | Path to CA certificate file (for server certificate verification)<br/>Required when using custom CAs or self-signed certificates<br/>Leave empty to use system root certificates only<br/>Like "/path/to/ca.crt" |
| `backend_tls.watch` | Bool | `false` | Watch for certificate file changes and auto reload |
| `grpc` | -- | -- | The gRPC server options. |
| `grpc.bind_addr` | String | `127.0.0.1:3002` | The address to bind the gRPC server. |
| `grpc.server_addr` | String | `127.0.0.1:3002` | The communication server address for the frontend and datanode to connect to metasrv.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `bind_addr`. |
@@ -388,6 +399,8 @@
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |
### Datanode
@@ -553,6 +566,8 @@
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |
### Flownode
@@ -562,6 +577,16 @@
| `node_id` | Integer | Unset | The flownode identifier and should be unique in the cluster. |
| `flow` | -- | -- | flow engine options. |
| `flow.num_workers` | Integer | `0` | The number of flow workers in the flownode.<br/>Not setting (or setting to 0) this value will use the number of CPU cores divided by 2. |
| `flow.batching_mode` | -- | -- | -- |
| `flow.batching_mode.query_timeout` | String | `600s` | The default batching engine query timeout is 10 minutes. |
| `flow.batching_mode.slow_query_threshold` | String | `60s` | Outputs a warn log for any query that runs for longer than this threshold |
| `flow.batching_mode.experimental_min_refresh_duration` | String | `5s` | The minimum duration between two query executions by a batching mode task |
| `flow.batching_mode.grpc_conn_timeout` | String | `5s` | The gRPC connection timeout |
| `flow.batching_mode.experimental_grpc_max_retries` | Integer | `3` | The gRPC max retry number |
| `flow.batching_mode.experimental_frontend_scan_timeout` | String | `30s` | Timeout for the flow to wait for an available frontend;<br/>if no available frontend is found after frontend_scan_timeout elapses, an error is returned,<br/>which prevents the flownode from starting |
| `flow.batching_mode.experimental_frontend_activity_timeout` | String | `60s` | Frontend activity timeout;<br/>if a frontend is down (not sending heartbeats) for more than frontend_activity_timeout,<br/>it will be removed from the list that the flownode uses to connect |
| `flow.batching_mode.experimental_max_filter_num_per_query` | Integer | `20` | Maximum number of filters allowed in a single query |
| `flow.batching_mode.experimental_time_window_merge_threshold` | Integer | `3` | Time window merge distance |
| `grpc` | -- | -- | The gRPC server options. |
| `grpc.bind_addr` | String | `127.0.0.1:6800` | The address to bind the gRPC server. |
| `grpc.server_addr` | String | `127.0.0.1:6800` | The address advertised to the metasrv,<br/>and used for connections from outside the host |
@@ -600,3 +625,5 @@
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `query` | -- | -- | -- |
| `query.parallelism` | Integer | `1` | Parallelism of the query engine for queries sent by the flownode.<br/>Defaults to 1, so it won't use too much CPU or memory |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |

View File

@@ -669,3 +669,11 @@ headers = { }
## The tokio console address.
## @toml2docs:none-default
#+ tokio_console_addr = "127.0.0.1"
## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.
## When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable
## is set to "prof:true,prof_active:false". The official image adds this env variable.
## Default is true.
enable_heap_profiling = true

View File

@@ -7,6 +7,29 @@ node_id = 14
## The number of flow workers in the flownode.
## Not setting (or setting to 0) this value will use the number of CPU cores divided by 2.
#+num_workers=0
[flow.batching_mode]
## The default batching engine query timeout is 10 minutes.
#+query_timeout="600s"
## Outputs a warn log for any query that runs for longer than this threshold
#+slow_query_threshold="60s"
## The minimum duration between two query executions by a batching mode task
#+experimental_min_refresh_duration="5s"
## The gRPC connection timeout
#+grpc_conn_timeout="5s"
## The gRPC max retry number
#+experimental_grpc_max_retries=3
## Timeout for the flow to wait for an available frontend;
## if no available frontend is found after frontend_scan_timeout elapses, an error is returned,
## which prevents the flownode from starting
#+experimental_frontend_scan_timeout="30s"
## Frontend activity timeout;
## if a frontend is down (not sending heartbeats) for more than frontend_activity_timeout,
## it will be removed from the list that the flownode uses to connect
#+experimental_frontend_activity_timeout="60s"
## Maximum number of filters allowed in a single query
#+experimental_max_filter_num_per_query=20
## Time window merge distance
#+experimental_time_window_merge_threshold=3
## The gRPC server options.
[grpc]
@@ -113,3 +136,11 @@ default_ratio = 1.0
## Parallelism of the query engine for queries sent by the flownode.
## Defaults to 1, so it won't use too much CPU or memory
parallelism = 1
## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.
## When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable
## is set to "prof:true,prof_active:false". The official image adds this env variable.
## Default is true.
enable_heap_profiling = true

View File

@@ -197,6 +197,9 @@ metadata_cache_tti = "5m"
## Parallelism of the query engine.
## Default to 0, which means the number of CPU cores.
parallelism = 0
## Whether to allow query fallback when push-down optimization fails.
## Defaults to false, meaning an error message is returned when push-down optimization fails
allow_query_fallback = false
## Datanode options.
[datanode]
@@ -277,3 +280,11 @@ headers = { }
## The tokio console address.
## @toml2docs:none-default
#+ tokio_console_addr = "127.0.0.1"
## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.
## When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable
## is set to "prof:true,prof_active:false". The official image adds this env variable.
## Default is true.
enable_heap_profiling = true

View File

@@ -65,6 +65,34 @@ node_max_idle_time = "24hours"
## The number of threads to execute the runtime for global write operations.
#+ compact_rt_size = 4
## TLS configuration for kv store backend (only applicable for PostgreSQL/MySQL backends)
## When using PostgreSQL or MySQL as metadata store, you can configure TLS here
[backend_tls]
## TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html
## - "disable" - No TLS
## - "prefer" (default) - Try TLS, fallback to plain
## - "require" - Require TLS
## - "verify_ca" - Require TLS and verify CA
## - "verify_full" - Require TLS and verify hostname
mode = "prefer"
## Path to client certificate file (for client authentication)
## Like "/path/to/client.crt"
cert_path = ""
## Path to client private key file (for client authentication)
## Like "/path/to/client.key"
key_path = ""
## Path to CA certificate file (for server certificate verification)
## Required when using custom CAs or self-signed certificates
## Leave empty to use system root certificates only
## Like "/path/to/ca.crt"
ca_cert_path = ""
## Watch for certificate file changes and auto reload
watch = false
## The gRPC server options.
[grpc]
## The address to bind the gRPC server.
@@ -265,3 +293,11 @@ headers = { }
## The tokio console address.
## @toml2docs:none-default
#+ tokio_console_addr = "127.0.0.1"
## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.
## When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable
## is set to "prof:true,prof_active:false". The official image adds this env variable.
## Default is true.
enable_heap_profiling = true

View File

@@ -783,3 +783,11 @@ headers = { }
## The tokio console address.
## @toml2docs:none-default
#+ tokio_console_addr = "127.0.0.1"
## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.
## When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable
## is set to "prof:true,prof_active:false". The official image adds this env variable.
## Default is true.
enable_heap_profiling = true

View File

@@ -47,4 +47,6 @@ WORKDIR /greptime
COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/
ENV PATH /greptime/bin/:$PATH
ENV MALLOC_CONF="prof:true,prof_active:false"
ENTRYPOINT ["greptime"]

View File

@@ -47,4 +47,6 @@ WORKDIR /greptime
COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/
ENV PATH /greptime/bin/:$PATH
ENV MALLOC_CONF="prof:true,prof_active:false"
ENTRYPOINT ["greptime"]

View File

@@ -15,4 +15,6 @@ ADD $TARGETARCH/greptime /greptime/bin/
ENV PATH /greptime/bin/:$PATH
ENV MALLOC_CONF="prof:true,prof_active:false"
ENTRYPOINT ["greptime"]

View File

@@ -18,4 +18,6 @@ ENV PATH /greptime/bin/:$PATH
ENV TARGET_BIN=$TARGET_BIN
ENV MALLOC_CONF="prof:true,prof_active:false"
ENTRYPOINT ["sh", "-c", "exec $TARGET_BIN \"$@\"", "--"]

View File

@@ -30,6 +30,23 @@ curl https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph
## Profiling
### Configuration
You can control heap profiling activation through configuration. Add the following to your configuration file:
```toml
[memory]
# Whether to enable heap profiling activation during startup.
# When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable
# is set to "prof:true,prof_active:false". The official image adds this env variable.
# Default is true.
enable_heap_profiling = true
```
By default, if you set `MALLOC_CONF=prof:true,prof_active:false`, the database will enable profiling during startup. You can disable this behavior by setting `enable_heap_profiling = false` in the configuration.
### Starting with environment variables
Start GreptimeDB instance with environment variables:
```bash
@@ -40,6 +57,23 @@ MALLOC_CONF=prof:true ./target/debug/greptime standalone start
_RJEM_MALLOC_CONF=prof:true ./target/debug/greptime standalone start
```
### Memory profiling control
You can control heap profiling activation using the new HTTP APIs:
```bash
# Check current profiling status
curl -X GET localhost:4000/debug/prof/mem/status
# Activate heap profiling (if not already active)
curl -X POST localhost:4000/debug/prof/mem/activate
# Deactivate heap profiling
curl -X POST localhost:4000/debug/prof/mem/deactivate
```
### Dump memory profiling data
Dump memory profiling data through HTTP API:
```bash

View File

@@ -1,4 +1,4 @@
Currently, our query engine is based on DataFusion, so all aggregate functions are executed by DataFusion through its UDAF interface. You can find DataFusion's UDAF example [here](https://github.com/apache/arrow-datafusion/blob/arrow2/datafusion-examples/examples/simple_udaf.rs). Basically, we provide the same way as DataFusion to write aggregate functions: both are centered on a struct called "Accumulator" that accumulates states along the way in aggregation.
Currently, our query engine is based on DataFusion, so all aggregate functions are executed by DataFusion through its UDAF interface. You can find DataFusion's UDAF example [here](https://github.com/apache/datafusion/tree/main/datafusion-examples/examples/simple_udaf.rs). Basically, we provide the same way as DataFusion to write aggregate functions: both are centered on a struct called "Accumulator" that accumulates states along the way in aggregation.
However, DataFusion's UDAF implementation has a huge restriction: it requires the user to provide a concrete "Accumulator". Take the `Median` aggregate function for example: to aggregate a `u32` datatype column, you have to write a `MedianU32` and use `SELECT MEDIANU32(x)` in SQL. `MedianU32` cannot be used to aggregate an `i32` datatype column. Or, there's another way: you can use a special type that can hold all kinds of data (like our `Value` enum or Arrow's `ScalarValue`) and `match` all the way up to do aggregate calculations. It might work, though it is rather tedious. (But I think it's DataFusion's preferred way to write UDAF.)

View File

@@ -76,7 +76,7 @@ pub trait CompactionStrategy {
```
The most suitable compaction strategy for the time-series scenario would be
a hybrid strategy that combines time window compaction with size-tiered compaction, just like [Cassandra](https://cassandra.apache.org/doc/latest/cassandra/operating/compaction/twcs.html) and [ScyllaDB](https://docs.scylladb.com/stable/architecture/compaction/compaction-strategies.html#time-window-compaction-strategy-twcs) do.
a hybrid strategy that combines time window compaction with size-tiered compaction, just like [Cassandra](https://cassandra.apache.org/doc/latest/cassandra/managing/operating/compaction/twcs.html) and [ScyllaDB](https://docs.scylladb.com/stable/architecture/compaction/compaction-strategies.html#time-window-compaction-strategy-twcs) do.
We can first group SSTs in level n into buckets according to some predefined time window. Within that window,
SSTs are compacted in a size-tiered manner (find SSTs with similar sizes and compact them to level n+1).

View File

@@ -28,7 +28,7 @@ In order to do those things while maintaining a low memory footprint, you need t
- Greptime Flow is built on top of [Hydroflow](https://github.com/hydro-project/hydroflow).
- We have three choices for the Dataflow/Streaming process framework for our simple continuous aggregation feature:
1. Based on the timely/differential dataflow crate that [materialize](https://github.com/MaterializeInc/materialize) is based on. Later, it proved too obscure for simple usage, and it is hard to customize memory usage control.
2. Based on a simple dataflow framework that we write from the ground up, like what [arroyo](https://www.arroyo.dev/) or [risingwave](https://www.risingwave.dev/) did; for example, the core streaming logic of [arroyo](https://github.com/ArroyoSystems/arroyo/blob/master/arroyo-datastream/src/lib.rs) only takes about 2000 lines of code. However, it means maintaining another layer of dataflow framework, which might seem easy in the beginning, but I fear it might be too burdensome to maintain once we need more features.
2. Based on a simple dataflow framework that we write from the ground up, like what [arroyo](https://www.arroyo.dev/) or [risingwave](https://www.risingwave.dev/) did; for example, the core streaming logic of [arroyo](https://github.com/ArroyoSystems/arroyo/blob/master/crates/arroyo-datastream/src/lib.rs) only takes about 2000 lines of code. However, it means maintaining another layer of dataflow framework, which might seem easy in the beginning, but I fear it might be too burdensome to maintain once we need more features.
3. Based on a simple and lower-level dataflow framework that someone else wrote, like [hydroflow](https://github.com/hydro-project/hydroflow); this approach combines the best of both worlds. Firstly, it boasts ease of comprehension and customization. Secondly, the dataflow framework offers precisely the necessary features for crafting uncomplicated single-node dataflow programs while delivering decent performance.
Hence, we choose the third option, and use a simple logical plan that's agnostic to the underlying dataflow framework, as it only describes what the dataflow graph should be doing, not how it does it. We built operators in hydroflow to execute the plan, and the resulting hydroflow graph is wrapped in an engine that only supports data in/out and a tick event to flush and compute the result. This provides a thin middle layer that's easy to maintain and allows switching to another dataflow framework if necessary.

View File

@@ -0,0 +1,154 @@
---
Feature Name: Repartition
Tracking Issue: https://github.com/GreptimeTeam/greptimedb/issues/6558
Date: 2025-06-20
Author: "Ruihang Xia <waynestxia@gmail.com>"
---
# Summary
This RFC proposes a method for repartitioning a table, to adjust the partition rule and data distribution.
# Motivation
With time passing, the data distribution and skew pattern of a table might change. We need a way to repartition the table to suit the new pattern.
# Details
Here is a rough workflow diagram of the entire repartition process, each step is described in detail below.
```mermaid
sequenceDiagram
participant Frontend
participant Metasrv
participant Datanodes
participant Region0 as Region 0
Frontend->>Frontend: Process request, validation etc.
Frontend->>Metasrv: Submit procedure
Metasrv->>Metasrv: Compute diff and generate migration plan
Metasrv->>Metasrv: Allocate necessary region resources (with PaaS)
Metasrv->>Datanodes: Stop compaction and snapshot
rect rgb(255, 225, 225)
note over Frontend, Region0: No Ingestion Period
Metasrv->>Frontend: Stop processing write requests
Metasrv->>Metasrv: Update metadata
Metasrv->>Frontend: Start processing read requests
end
Metasrv->>Datanodes: Update region rule, stage version changes from now on
Region0->>Region0: Compute new manifests for all regions
Region0->>Datanodes: Submit manifest changes
Metasrv->>Datanodes: Recover compaction and snapshot, make staged changes visible
note over Frontend, Datanodes: Reload Cache
Metasrv->>Metasrv: Release resources (with PaaS)
Metasrv->>Metasrv: Schedule optional compaction (to remote compactor)
```
## Preprocessing
This phase is for static analysis of the new partition rule. The server can know whether the repartitioning is possible, how to do the repartitioning, and how many resources are needed.
In theory, the input and output partition rules for repartitioning can be completely unrelated. But in practice, to avoid a very large change set, we'll only allow two simple kinds of change: one splits a region into two regions (region split) and the other merges two regions into one (region merge).
After validating the new partition rule using the same validation logic as table creation, we compute the difference between the old and new partition rules. The resulting diff may contain several independent groups of changes. During subsequent processing, each group of changes can be handled independently and can succeed or fail without affecting other groups or creating non-idempotently retryable scenarios.
Next, we generate a repartition plan for each group of changes. Each plan contains the information for all regions involved in it, and one target region will only be referenced by a single plan.
With those plans, we can determine the resource requirements for the repartition operation, where resources here primarily refer to regions. Metasrv will coordinate with the PaaS layer to pre-allocate the necessary regions at this stage. These new regions start completely empty, and their metadata and manifests will be populated during subsequent modification steps.
## Data Processing
This phase primarily covers changes to regions, including region metadata (the route table and the corresponding rule) and manifests.
Once we start processing one plan through a procedure, we'll first stop the region's compaction and snapshot. This is to avoid any state being removed by compaction (which may remove old SST files) or snapshot (which may remove old manifest files).
Metasrv will try to update the partition metadata, i.e., the region route table (related to `PartitionRuleManager`). This step is within the "no ingestion" scope, so no new data will be ingested. Since this won't take much time, the impact on the cluster is minimized. Metasrv will also update the region rule on the corresponding regions on Datanodes.
Every region and every ingestion request to the region server will carry a version of the region rule, to identify under which rule the request is processed. The version can be something like `hash(region_rule)`. Once the region rule on the region server is updated, all ingestion requests with the old rule will be rejected, and all requests with the new rule will be accepted but not made visible. They can still be flushed to persistent storage, but their version changes (new manifests) will be staged.
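To make the staging behavior above concrete, here is a minimal sketch, assuming a hash-based rule version; names such as `RegionRuleVersion`, `IngestOutcome`, and `handle_ingest` are illustrative only and not actual GreptimeDB APIs:
```rust
// A minimal sketch of the rule-version check described above. All names here are
// illustrative, not actual GreptimeDB APIs.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct RegionRuleVersion(u64);

/// "Something like hash(region_rule)": derive a version from the serialized rule.
fn rule_version(region_rule: &str) -> RegionRuleVersion {
    let mut hasher = DefaultHasher::new();
    region_rule.hash(&mut hasher);
    RegionRuleVersion(hasher.finish())
}

enum IngestOutcome {
    /// Written and immediately visible: rule versions match and no repartition is staging.
    Visible,
    /// Written but staged: flushed to storage, manifest change held back until acknowledged.
    Staged,
    /// The request still carries the old rule version after the region switched rules.
    Rejected,
}

fn handle_ingest(
    current: RegionRuleVersion,
    request: RegionRuleVersion,
    staging_in_progress: bool,
) -> IngestOutcome {
    if request != current {
        IngestOutcome::Rejected
    } else if staging_in_progress {
        IngestOutcome::Staged
    } else {
        IngestOutcome::Visible
    }
}

fn main() {
    let current = rule_version("region_rule_v2");
    let outdated = rule_version("region_rule_v1");
    assert!(matches!(handle_ingest(current, outdated, true), IngestOutcome::Rejected));
    assert!(matches!(handle_ingest(current, current, true), IngestOutcome::Staged));
}
```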
Then region 0 (or any operational region picked by metasrv) will compute the new manifests for all target regions. This step is done by first reading all the old manifests and remapping the files with the new partition rule to get the content of the new manifests. Note that this step only handles the manifests from before the region rule change on the region server, and won't touch the staged manifests, as they already follow the new rule.
Those new manifests will be submitted to the corresponding target regions by region 0 via a `RegionEdit` request. If this request fails after a few retries, region 0 will try to roll back the change by directly overwriting the manifest on object storage, report the failure to metasrv, and let the entire repartition procedure fail. We can also optionally compute new manifests for the staged version changes (like another repartition) and submit them to the target regions, to make them visible even if the repartition fails.
On the other hand, a successful `RegionEdit` request also acknowledges the staged version changes and makes them visible.
After this step, the repartition is done in the data plane, and we can resume compaction and snapshot.
## Postprocessing
After the main processing is done, we can do some extra postprocessing to reduce the performance impact of the repartition, including reloading the frontend's route table cache, metasrv's KV cache, and the datanode's read/write/page caches.
We can also schedule an optional compaction to reorganize all the data files under the new partition rule, reducing potential fragmentation or read amplification.
## Procedure
Here is the repartition procedure, step by step:
- <on frontend> Validate the repartition request
- <on frontend> Initialize the repartition procedure
- Calculate rule diff and repartition plan group
- Allocate necessary new regions
- Lock the table key
- For each repartition subprocedure
- Stop compaction and snapshot
- Forbid new ingestion requests, update metadata, allow ingestion requests.
- Update region rule to regions
- Pick one region to calculate the new manifests for all regions in this repartition group
- Let that region apply the new manifest to each region via `RegionEdit`
- If it fails after some retries, revert this manifest change on the regions that succeeded and mark the failure.
- If all succeeded, acknowledge those staged version changes and make them visible.
- Return result
- Collect results from the subprocedures.
- For those that failed, we need to restart the affected regions to force them to reconstruct their state from manifests
- For those that succeeded, collect and merge their rule diffs
- Unlock the table key
- Report the result to the user.
- <in background> Reload cache
- <in background> Maybe trigger a special compaction
In addition to the sequential steps, rollback is also an important part of this procedure. There are three steps that can be rolled back when an unrecoverable failure occurs.
If the metadata update is not committed, we can overwrite the metadata with the previous version. This step is scoped within the "no ingestion" period, so no new data will be ingested and the state of both datanode and metasrv will remain consistent.
If the `RegionEdit` to other regions is not acknowledged, or only partially acknowledged, we can directly overwrite the manifest on object storage from the central region (which computed the new manifest), and force the region server to reload the corresponding region from object storage to recover.
If the staged version changes are not acknowledged, we can re-compute manifests based on the old rule for the staged data and apply them directly as above. This is like another, smaller repartition for the staged data.
## Region rule validation and diff calculation
In the current codebase, the rule checker is not complete: it can't check the uniqueness and completeness of the rule. This RFC also proposes a new way to validate the rule.
The proposed validation is based on a check-point system, which first generates a group of check-points from the rule, and then checks that every point is covered by exactly one rule.
All partition rule expressions are limited to the form `<column> <operator> <value>`, and the operators are limited to comparison operators. Those expressions are allowed to be nested with `AND` and `OR` operators. Based on this, we can first extract all the unique values of each column, adding and subtracting a small epsilon to cover the left and right boundaries.
Since we accept integers, floats and strings as value types, computing on them directly is not convenient. So we first normalize them to a common type; only the relative partial ordering needs to be preserved. This also avoids the problems of "what is the next/previous value" for strings and "what is a good precision" for floats.
After normalization, we get a set of scatter points for each column. Then we can generate a set of check-points by combining all the scatter points, like building a Cartesian product. This might produce a large number of check-points, so we can apply a pruning optimization to remove some of them by merging some of the expression zones. Expressions that have identical N-1 edge sub-expressions with one adjacent edge can be merged together. This pruning check has a time complexity of O(N * M * log(M)), where N is the number of active dimensions and M is the number of expression zones. Diff calculation is also done by finding differing expression zones between the old and new rule sets, and checking whether we can transform one into the other by merging some of the expression zones.
The step that validates the check-point set against the expressions can be treated as a tiny evaluation of `PhysicalExpr`. This evaluation gives a boolean matrix of shape K * M, where K is the number of check-points. We then check that each row of the matrix contains one and only one true value.
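As a rough illustration of this check-point idea, the sketch below assumes values are already normalized to integers and models expression zones as plain closures instead of the real partition-rule AST; all names are illustrative:
```rust
// Illustrative sketch of the check-point validation described above.
type Point = Vec<i64>;
type Zone = Box<dyn Fn(&Point) -> bool>;

/// Collect candidate coordinates per column: every boundary value plus/minus an
/// "epsilon" step (1 after normalization to integers).
fn scatter_points(boundaries_per_column: &[Vec<i64>]) -> Vec<Vec<i64>> {
    boundaries_per_column
        .iter()
        .map(|bs| {
            let mut points: Vec<i64> = bs.iter().flat_map(|b| [b - 1, *b, b + 1]).collect();
            points.sort_unstable();
            points.dedup();
            points
        })
        .collect()
}

/// Cartesian product of per-column scatter points.
fn check_points(columns: &[Vec<i64>]) -> Vec<Point> {
    columns.iter().fold(vec![vec![]], |acc, col| {
        acc.iter()
            .flat_map(|p| {
                col.iter().map(move |v| {
                    let mut q = p.clone();
                    q.push(*v);
                    q
                })
            })
            .collect()
    })
}

/// Every check-point must be covered by exactly one zone (the K x M boolean matrix check).
fn validate(zones: &[Zone], points: &[Point]) -> bool {
    points.iter().all(|p| zones.iter().filter(|z| z(p)).count() == 1)
}

fn main() {
    // Single column `x`, rule boundary at 10: zone A is `x < 10`, zone B is `x >= 10`.
    let zones: Vec<Zone> = vec![
        Box::new(|p: &Point| p[0] < 10),
        Box::new(|p: &Point| p[0] >= 10),
    ];
    let points = check_points(&scatter_points(&[vec![10]]));
    assert!(validate(&zones, &points));
}
```
The `main` example checks a single-column rule split at 10, where the two zones together cover every check-point exactly once.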
## Compute and use new manifest
We can generate a new set of manifest files based on the old manifests and the two versions of the rule. From the rule processing above, we can tell how a new rule and region are derived from the previous ones. So a simple way to get a new manifest is to apply the same change steps to the manifest files. E.g., if region A is merged from regions B and C, we simply combine all file IDs from B and C to generate the content of A.
If necessary, we can do better by involving metadata about the data, such as the min-max statistics of each file, and pre-evaluating over the min-max values to filter out unneeded files when generating the new manifest.
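A minimal sketch of the merge case described above, with simplified stand-in types (`FileMeta`, `Manifest`) rather than the actual manifest structures; the `keep` predicate corresponds to the optional min-max pruning:
```rust
// Sketch only: `FileMeta` and `Manifest` are simplified stand-ins for the real
// manifest structures.
#[derive(Clone)]
struct FileMeta {
    file_id: String,
    /// Optional min/max statistics on the partition column, usable for pruning.
    min_max: Option<(i64, i64)>,
}

struct Manifest {
    files: Vec<FileMeta>,
}

/// Region merge: the new region's manifest is the union of the files referenced by the
/// source regions. The `keep` predicate prunes files whose statistics show they cannot
/// contain rows belonging to the new region.
fn merge_manifests(sources: &[&Manifest], keep: impl Fn(&FileMeta) -> bool) -> Manifest {
    Manifest {
        files: sources
            .iter()
            .flat_map(|m| m.files.iter().cloned())
            .filter(|f| keep(f))
            .collect(),
    }
}

fn main() {
    let b = Manifest { files: vec![FileMeta { file_id: "b-1".into(), min_max: Some((0, 9)) }] };
    let c = Manifest { files: vec![FileMeta { file_id: "c-1".into(), min_max: None }] };
    // Region A simply takes every file from B and C (no pruning here).
    let a = merge_manifests(&[&b, &c], |_| true);
    assert_eq!(a.files.len(), 2);
    assert_eq!(a.files[0].file_id, "b-1");
}
```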
Using the new manifest needs one extra step compared to the current implementation. We'll need to record, either in the manifest or in the file metadata, which rule was used when generating (flushing or compacting) an SST file. Then, in every read request, we need to append the current region rule as a predicate, to ensure no data belonging to other regions is read. We can use the stored region rule to reduce the number of new predicates to apply, by removing predicates that are identical between the current region rule and the stored region rule. So ideally, in a table that has not been repartitioned recently, the overhead of checking the region rule is minimal.
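And a tiny sketch of that predicate reduction, with a placeholder `Expr` type standing in for the real partition-rule expressions:
```rust
// Sketch only: `Expr` is a placeholder for the real partition-rule expression type.
#[derive(Clone, PartialEq)]
struct Expr(String);

/// Only the parts of the current region rule that are not already guaranteed by the
/// rule stored with the SST file need to be appended to the read request.
fn extra_read_predicates(current_rule: &[Expr], stored_rule: &[Expr]) -> Vec<Expr> {
    current_rule
        .iter()
        .filter(|e| !stored_rule.contains(e))
        .cloned()
        .collect()
}

fn main() {
    let current = vec![Expr("host >= 'h100'".into()), Expr("host < 'h200'".into())];
    let stored = vec![Expr("host >= 'h100'".into())];
    // Only the new upper bound has to be evaluated when reading this file.
    assert_eq!(extra_read_predicates(&current, &stored).len(), 1);
}
```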
## Pre-required tasks
In the above steps, we assume some functionalities are already implemented. Here we list them, where they are used, and how to implement them.
### Cross-region read
The current data directory structure is `{table_id}/{region_id}/[data/metadata]/{file_id}`; every region can only access files under its own directory. After repartition, data files may be placed under previous, old regions. So we need to support cross-region reads. This new access method allows a region to access any file under the same table. The related tracking issue is <https://github.com/GreptimeTeam/greptimedb/issues/6409>.
### Global GC worker
This is to simplify the state management of data files, as one file may be referenced by multiple manifests, or by no manifest at all. After this, every region and the repartition process only need to care about generating and using new files, without tracking whether a file should be deleted, leaving the deletion to the global GC worker. This worker basically works by counting references from manifest files and removing unreferenced ones. The related tracking issue is **TBD**.
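A hypothetical sketch of the reference-counting pass such a GC worker could run (file IDs are plain strings here for brevity; a real worker would read manifests from object storage):
```rust
// Sketch only: count how many manifests reference each file and report files with
// zero references as deletable.
use std::collections::HashMap;

fn unreferenced_files<'a>(
    all_files: &'a [String],
    manifests: &[Vec<String>], // each manifest is the list of file IDs it references
) -> Vec<&'a String> {
    let mut refs: HashMap<&String, usize> = HashMap::new();
    for manifest in manifests {
        for file_id in manifest {
            *refs.entry(file_id).or_insert(0) += 1;
        }
    }
    all_files
        .iter()
        .filter(|f| refs.get(f).copied().unwrap_or(0) == 0)
        .collect()
}

fn main() {
    let all = vec!["a".to_string(), "b".to_string(), "c".to_string()];
    let manifests = vec![vec!["a".to_string()], vec!["a".to_string(), "b".to_string()]];
    let garbage = unreferenced_files(&all, &manifests);
    // Only "c" has no manifest referencing it.
    assert_eq!(garbage.len(), 1);
    assert_eq!(garbage[0], "c");
}
```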
# Alternatives
In the "Data Processing" section, we can enlarge the "no ingestion" period to include almost all the steps. This can simplify the entire procedure by a lot, but will bring a longer time of ingestion pause which may not be acceptable.

View File

@@ -0,0 +1,151 @@
---
Feature Name: Compatibility Test Framework
Tracking Issue: TBD
Date: 2025-07-04
Author: "Ruihang Xia <waynestxia@gmail.com>"
---
# Summary
This RFC proposes a compatibility test framework for GreptimeDB to ensure backward/forward compatibility for different versions of GreptimeDB.
# Motivation
In current practice, we don't have a systematic way to test and ensure the compatibility of different versions of GreptimeDB. Each time we release a new version, we need to manually test the compatibility with ad-hoc cases. This is not only time-consuming, but also error-prone and unmaintainable. It relies heavily on the release manager to ensure the compatibility of different versions of GreptimeDB.
We don't have a detailed guide in the release SOP on how to test and ensure the compatibility of a new version, and we have broken compatibility many times (`v0.14.1` and `v0.15.1` are two examples, both released right after a major release).
# Details
This RFC proposes a compatibility test framework that is easy to maintain, extend and run. It can tell the compatibility between any two given versions of GreptimeDB, both backward and forward. It's based on the Sqlness library but used in a different way.
Generally speaking, the framework is composed of two parts:
1. Test cases: A set of test cases that are maintained dedicatedly for the compatibility test. Still in the `.sql` and `.result` format.
2. Test framework: A new sqlness runner used to run the test cases, with some new features that are not required by the integration sqlness test.
## Test Cases
### Structure
The case set is organized in three parts:
- `1.feature`: Use a new feature
- `2.verify`: Verify database behavior
- `3.cleanup`: Paired with `1.feature`, clean up the test environment.
These three parts are organized in a tree structure, and should be run in sequence:
```
compatibility_test/
├── 1.feature/
│ ├── feature-a/
│ ├── feature-b/
│ └── feature-c/
├── 2.verify/
│ ├── verify-metadata/
│ ├── verify-data/
│ └── verify-schema/
└── 3.cleanup/
├── cleanup-a/
├── cleanup-b/
└── cleanup-c/
```
### Example
For example, for a new feature like adding a new index option ([#6416](https://github.com/GreptimeTeam/greptimedb/pull/6416)), we (who implement the feature) create a new test case like this:
```sql
-- path: compatibility_test/1.feature/index-option/granularity_and_false_positive_rate.sql
-- SQLNESS ARG since=0.15.0
-- SQLNESS IGNORE_RESULT
CREATE TABLE granularity_and_false_positive_rate (ts timestamp time index, val double) with ("index.granularity" = "8192", "index.false_positive_rate" = "0.01");
```
And
```sql
-- path: compatibility_test/3.cleanup/index-option/granularity_and_false_positive_rate.sql
drop table granularity_and_false_positive_rate;
```
Since this new feature doesn't require a special way to verify the database behavior, we can reuse existing test cases in `2.verify/`. For example, we can reuse the `verify-metadata` test case to verify the metadata of the table.
```sql
-- path: compatibility_test/2.verify/verify-metadata/show-create-table.sql
-- SQLNESS TEMPLATE TABLE="SHOW TABLES";
SHOW CREATE TABLE $TABLE;
```
In this example, we use some new sqlness features that will be introduced in the next section (`since`, `IGNORE_RESULT`, `TEMPLATE`).
### Maintenance
Each time we implement a new feature that should be covered by the compatibility test, we should create new test cases in `1.feature/` and `3.cleanup/` for it, and check whether existing cases in `2.verify/` can be reused to verify the database behavior.
This simulates an enthusiastic user who adopts all the new features right away. The only new maintenance burden is on the feature implementer, who writes one more test case for the new feature to pin down its behavior. Once there is a breaking change in the future, it can be detected by the compatibility test framework automatically.
Another topic is deprecation. If a feature is deprecated, we should also mark it in the test case. Still using the above example, assume we deprecate the `index.granularity` and `index.false_positive_rate` index options in `v0.99.0`; we can mark them as:
```sql
-- SQLNESS ARG since=0.15.0 till=0.99.0
...
```
This tells the framework to ignore this feature in version `v0.99.0` and later. Currently we have many experimental features that are scheduled to be broken in the future, so this is a good way to mark them.
## Test Framework
This section is about new sqlness features required by this framework.
### Since and Till
Following the `ARG` interceptor in sqlness, we can mark a feature as available between two given versions. Only `since` is required:
```sql
-- SQLNESS ARG since=VERSION_STRING [till=VERSION_STRING]
```
### IGNORE_RESULT
`IGNORE_RESULT` is a new interceptor, it tells the runner to ignore the result of the query, only check whether the query is executed successfully.
This is useful for reducing the maintenance burden of the test cases. Unlike the integration sqlness test, in most cases we don't care about the result of the query; we only need to make sure the query executes successfully.
### TEMPLATE
`TEMPLATE` is another new interceptor; it can generate queries from a template based on runtime data.
In the above example, we need to run the `SHOW CREATE TABLE` query for all existing tables, so we can use the `TEMPLATE` interceptor to generate the query with a dynamic table list.
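As a rough illustration of how such an expansion could work, assuming the runner has already executed the templated query (e.g. `SHOW TABLES`) and collected the result rows; the function name and signature below are illustrative, not the actual sqlness API:
```rust
// Sketch only: substitute each runtime value into the template to produce one query
// per existing table.
fn expand_template(template: &str, placeholder: &str, values: &[&str]) -> Vec<String> {
    values
        .iter()
        .map(|value| template.replace(placeholder, value))
        .collect()
}

fn main() {
    let tables = ["granularity_and_false_positive_rate", "another_table"];
    let queries = expand_template("SHOW CREATE TABLE $TABLE;", "$TABLE", &tables);
    assert_eq!(queries[0], "SHOW CREATE TABLE granularity_and_false_positive_rate;");
    assert_eq!(queries.len(), 2);
}
```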
### RUNNER
There are also some extra requirements for the runner itself:
- It should run the test cases in sequence, first `1.feature/`, then `2.verify/`, and finally `3.cleanup/`.
- It should be able to fetch the required versions automatically to finish the test.
- It should handle the `since` and `till` properly.
In the `1.feature` phase, the runner needs to identify all features that need to be tested by version number, and then restart with a new version (the `to` version) to run the `2.verify/` and `3.cleanup/` phases.
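A hedged sketch of how the runner might apply `since`/`till` when deciding whether a feature case runs against a given `from` version; version handling is simplified to dotted integers and this is not the actual runner code:
```rust
// Sketch only: a case marked `since=X [till=Y]` runs when X <= from < Y.
fn parse_version(version: &str) -> Vec<u64> {
    version.split('.').map(|part| part.parse().unwrap_or(0)).collect()
}

fn case_applies(from_version: &str, since: &str, till: Option<&str>) -> bool {
    let from = parse_version(from_version);
    if from < parse_version(since) {
        return false; // the feature does not exist yet in the `from` version
    }
    match till {
        Some(deprecated_in) => from < parse_version(deprecated_in),
        None => true,
    }
}

fn main() {
    assert!(case_applies("0.16.0", "0.15.0", None));
    assert!(!case_applies("0.14.1", "0.15.0", None));
    // Deprecated in v0.99.0: ignored from that version onwards.
    assert!(!case_applies("0.99.0", "0.15.0", Some("0.99.0")));
}
```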
## Test Report
Finally, we can run the compatibility test to verify the compatibility between any given two versions of GreptimeDB, for example:
```bash
# check backward compatibility between v0.15.0 and v0.16.0 when releasing v0.16.0
./sqlness run --from=0.15.0 --to=0.16.0
# check forward compatibility when downgrading from v0.15.0 to v0.13.0
./sqlness run --from=0.15.0 --to=0.13.0
```
We can also use a script to run the compatibility test for all the versions in a given range to give a quick report with all versions we need.
Since we always bump the version in `Cargo.toml` to the next major release version, that version can be used as the "latest" unpublished version for scenarios like local testing.
# Alternatives
There was a previous attempt to implement a compatibility test framework, which was later disabled for several reasons [#3728](https://github.com/GreptimeTeam/greptimedb/issues/3728).

View File

@@ -83,7 +83,7 @@ If you use the [Helm Chart](https://github.com/GreptimeTeam/helm-charts) to depl
- `monitoring.enabled=true`: Deploys a standalone GreptimeDB instance dedicated to monitoring the cluster;
- `grafana.enabled=true`: Deploys Grafana and automatically imports the monitoring dashboard;
The standalone GreptimeDB instance will collect metrics from your cluster, and the dashboard will be available in the Grafana UI. For detailed deployment instructions, please refer to our [Kubernetes deployment guide](https://docs.greptime.com/user-guide/deployments-administration-administration/deploy-on-kubernetes/getting-started).
The standalone GreptimeDB instance will collect metrics from your cluster, and the dashboard will be available in the Grafana UI. For detailed deployment instructions, please refer to our [Kubernetes deployment guide](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/overview).
### Self-host Prometheus and import dashboards manually

File diff suppressed because it is too large

View File

@@ -21,14 +21,14 @@
# Resources
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
| Datanode Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$datanode"}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{instance}}]-[{{ pod }}]` |
| Datanode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$datanode"}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$frontend"}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$frontend"}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]-cpu` |
| Metasrv Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$metasrv"}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{ instance }}]-[{{ pod }}]-resident` |
| Metasrv CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$metasrv"}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$flownode"}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$flownode"}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Datanode Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$datanode"}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-datanode"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{instance}}]-[{{ pod }}]` |
| Datanode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$datanode"}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-datanode"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$frontend"}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-frontend"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$frontend"}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-frontend"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]-cpu` |
| Metasrv Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$metasrv"}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-metasrv"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{ instance }}]-[{{ pod }}]-resident` |
| Metasrv CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$metasrv"}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-metasrv"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode Memory per Instance | `sum(process_resident_memory_bytes{instance=~"$flownode"}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-flownode"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{instance=~"$flownode"}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-flownode"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
# Frontend Requests
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
@@ -72,18 +72,19 @@
| Region Worker Handle Bulk Insert Requests | `histogram_quantile(0.95, sum by(le,instance, stage, pod) (rate(greptime_region_worker_handle_write_bucket[$__rate_interval])))`<br/>`sum by(instance, stage, pod) (rate(greptime_region_worker_handle_write_sum[$__rate_interval]))/sum by(instance, stage, pod) (rate(greptime_region_worker_handle_write_count[$__rate_interval]))` | `timeseries` | Per-stage elapsed time for region worker to handle bulk insert region requests. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{stage}}]-P95` |
| Active Series and Field Builders Count | `sum by(instance, pod) (greptime_mito_memtable_active_series_count)`<br/>`sum by(instance, pod) (greptime_mito_memtable_field_builder_count)` | `timeseries` | Compaction input/output bytes | `prometheus` | `none` | `[{{instance}}]-[{{pod}}]-series` |
| Region Worker Convert Requests | `histogram_quantile(0.95, sum by(le, instance, stage, pod) (rate(greptime_datanode_convert_region_request_bucket[$__rate_interval])))`<br/>`sum by(le,instance, stage, pod) (rate(greptime_datanode_convert_region_request_sum[$__rate_interval]))/sum by(le,instance, stage, pod) (rate(greptime_datanode_convert_region_request_count[$__rate_interval]))` | `timeseries` | Per-stage elapsed time for region worker to decode requests. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{stage}}]-P95` |
| Cache Miss | `sum by (instance,pod, type) (rate(greptime_mito_cache_miss{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | The local cache miss of the datanode. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]` |
# OpenDAL
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
| QPS per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Read QPS per Instance | `sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation="read"}[$__rate_interval]))` | `timeseries` | Read QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| Read P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode",operation="read"}[$__rate_interval])))` | `timeseries` | Read P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-{{scheme}}` |
| Write QPS per Instance | `sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation="write"}[$__rate_interval]))` | `timeseries` | Write QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-{{scheme}}` |
| Write P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation="write"}[$__rate_interval])))` | `timeseries` | Write P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| Read QPS per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation=~"read\|Reader::read"}[$__rate_interval]))` | `timeseries` | Read QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Read P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode",operation=~"read\|Reader::read"}[$__rate_interval])))` | `timeseries` | Read P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Write QPS per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation=~"write\|Writer::write\|Writer::close"}[$__rate_interval]))` | `timeseries` | Write QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Write P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation =~ "Writer::write\|Writer::close\|write"}[$__rate_interval])))` | `timeseries` | Write P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| List QPS per Instance | `sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation="list"}[$__rate_interval]))` | `timeseries` | List QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| List P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation="list"}[$__rate_interval])))` | `timeseries` | List P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| Other Requests per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode",operation!~"read\|write\|list\|stat"}[$__rate_interval]))` | `timeseries` | Other Requests per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Other Request P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation!~"read\|write\|list"}[$__rate_interval])))` | `timeseries` | Other Request P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Other Request P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation!~"read\|write\|list\|Writer::write\|Writer::close\|Reader::read"}[$__rate_interval])))` | `timeseries` | Other Request P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Opendal traffic | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_bytes_sum{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Total traffic as in bytes by instance and operation | `prometheus` | `decbytes` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| OpenDAL errors per Instance | `sum by(instance, pod, scheme, operation, error) (rate(opendal_operation_errors_total{instance=~"$datanode", error!="NotFound"}[$__rate_interval]))` | `timeseries` | OpenDAL error counts per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]-[{{error}}]` |
# Metasrv

View File

@@ -180,13 +180,18 @@ groups:
- title: Datanode Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{instance=~"$datanode"}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{ pod }}]'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-datanode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Datanode CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -197,16 +202,26 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-datanode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Frontend Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{instance=~"$frontend"}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-frontend"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Frontend CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -217,16 +232,26 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]-cpu'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-frontend"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Metasrv Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{instance=~"$metasrv"}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]-resident'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-metasrv"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Metasrv CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -237,16 +262,26 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-metasrv"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Flownode Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{instance=~"$flownode"}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-flownode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Flownode CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -257,6 +292,11 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-flownode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Frontend Requests
panels:
- title: HTTP QPS per Instance
@@ -642,6 +682,15 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-AVG'
- title: Cache Miss
type: timeseries
description: The local cache miss of the datanode.
queries:
- expr: sum by (instance,pod, type) (rate(greptime_mito_cache_miss{instance=~"$datanode"}[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]'
- title: OpenDAL
panels:
- title: QPS per Instance
@@ -659,41 +708,41 @@ groups:
description: Read QPS per Instance.
unit: ops
queries:
- expr: sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation="read"}[$__rate_interval]))
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation=~"read|Reader::read"}[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: Read P99 per Instance
type: timeseries
description: Read P99 per Instance.
unit: s
queries:
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode",operation="read"}[$__rate_interval])))
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode",operation=~"read|Reader::read"}[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-{{scheme}}'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: Write QPS per Instance
type: timeseries
description: Write QPS per Instance.
unit: ops
queries:
- expr: sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation="write"}[$__rate_interval]))
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation=~"write|Writer::write|Writer::close"}[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-{{scheme}}'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: Write P99 per Instance
type: timeseries
description: Write P99 per Instance.
unit: s
queries:
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation="write"}[$__rate_interval])))
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation =~ "Writer::write|Writer::close|write"}[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: List QPS per Instance
type: timeseries
description: List QPS per Instance.
@@ -729,7 +778,7 @@ groups:
description: Other Request P99 per Instance.
unit: s
queries:
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation!~"read|write|list"}[$__rate_interval])))
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation!~"read|write|list|Writer::write|Writer::close|Reader::read"}[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}

File diff suppressed because it is too large.

View File

@@ -21,14 +21,14 @@
# Resources
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
| Datanode Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{instance}}]-[{{ pod }}]` |
| Datanode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]-cpu` |
| Metasrv Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{ instance }}]-[{{ pod }}]-resident` |
| Metasrv CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)` | `timeseries` | Current memory usage by instance | `prometheus` | `decbytes` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Datanode Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-datanode"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{instance}}]-[{{ pod }}]` |
| Datanode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-datanode"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-frontend"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{ instance }}]-[{{ pod }}]` |
| Frontend CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-frontend"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]-cpu` |
| Metasrv Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-metasrv"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{ instance }}]-[{{ pod }}]-resident` |
| Metasrv CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-metasrv"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode Memory per Instance | `sum(process_resident_memory_bytes{}) by (instance, pod)`<br/>`max(greptime_memory_limit_in_bytes{app="greptime-flownode"})` | `timeseries` | Current memory usage by instance | `prometheus` | `bytes` | `[{{ instance }}]-[{{ pod }}]` |
| Flownode CPU Usage per Instance | `sum(rate(process_cpu_seconds_total{}[$__rate_interval]) * 1000) by (instance, pod)`<br/>`max(greptime_cpu_limit_in_millicores{app="greptime-flownode"})` | `timeseries` | Current cpu usage by instance | `prometheus` | `none` | `[{{ instance }}]-[{{ pod }}]` |
# Frontend Requests
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
@@ -72,18 +72,19 @@
| Region Worker Handle Bulk Insert Requests | `histogram_quantile(0.95, sum by(le,instance, stage, pod) (rate(greptime_region_worker_handle_write_bucket[$__rate_interval])))`<br/>`sum by(instance, stage, pod) (rate(greptime_region_worker_handle_write_sum[$__rate_interval]))/sum by(instance, stage, pod) (rate(greptime_region_worker_handle_write_count[$__rate_interval]))` | `timeseries` | Per-stage elapsed time for region worker to handle bulk insert region requests. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{stage}}]-P95` |
| Active Series and Field Builders Count | `sum by(instance, pod) (greptime_mito_memtable_active_series_count)`<br/>`sum by(instance, pod) (greptime_mito_memtable_field_builder_count)` | `timeseries` | Compaction input/output bytes | `prometheus` | `none` | `[{{instance}}]-[{{pod}}]-series` |
| Region Worker Convert Requests | `histogram_quantile(0.95, sum by(le, instance, stage, pod) (rate(greptime_datanode_convert_region_request_bucket[$__rate_interval])))`<br/>`sum by(le,instance, stage, pod) (rate(greptime_datanode_convert_region_request_sum[$__rate_interval]))/sum by(le,instance, stage, pod) (rate(greptime_datanode_convert_region_request_count[$__rate_interval]))` | `timeseries` | Per-stage elapsed time for region worker to decode requests. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{stage}}]-P95` |
| Cache Miss | `sum by (instance,pod, type) (rate(greptime_mito_cache_miss{}[$__rate_interval]))` | `timeseries` | The local cache miss of the datanode. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]` |
# OpenDAL
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
| QPS per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{}[$__rate_interval]))` | `timeseries` | QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Read QPS per Instance | `sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{ operation="read"}[$__rate_interval]))` | `timeseries` | Read QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| Read P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{operation="read"}[$__rate_interval])))` | `timeseries` | Read P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-{{scheme}}` |
| Write QPS per Instance | `sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{ operation="write"}[$__rate_interval]))` | `timeseries` | Write QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-{{scheme}}` |
| Write P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{ operation="write"}[$__rate_interval])))` | `timeseries` | Write P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| Read QPS per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{ operation=~"read\|Reader::read"}[$__rate_interval]))` | `timeseries` | Read QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Read P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{operation=~"read\|Reader::read"}[$__rate_interval])))` | `timeseries` | Read P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Write QPS per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{ operation=~"write\|Writer::write\|Writer::close"}[$__rate_interval]))` | `timeseries` | Write QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Write P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{ operation =~ "Writer::write\|Writer::close\|write"}[$__rate_interval])))` | `timeseries` | Write P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| List QPS per Instance | `sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{ operation="list"}[$__rate_interval]))` | `timeseries` | List QPS per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| List P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{ operation="list"}[$__rate_interval])))` | `timeseries` | List P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]` |
| Other Requests per Instance | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{operation!~"read\|write\|list\|stat"}[$__rate_interval]))` | `timeseries` | Other Requests per Instance. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Other Request P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{ operation!~"read\|write\|list"}[$__rate_interval])))` | `timeseries` | Other Request P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Other Request P99 per Instance | `histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{ operation!~"read\|write\|list\|Writer::write\|Writer::close\|Reader::read"}[$__rate_interval])))` | `timeseries` | Other Request P99 per Instance. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| Opendal traffic | `sum by(instance, pod, scheme, operation) (rate(opendal_operation_bytes_sum{}[$__rate_interval]))` | `timeseries` | Total traffic as in bytes by instance and operation | `prometheus` | `decbytes` | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]` |
| OpenDAL errors per Instance | `sum by(instance, pod, scheme, operation, error) (rate(opendal_operation_errors_total{ error!="NotFound"}[$__rate_interval]))` | `timeseries` | OpenDAL error counts per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]-[{{error}}]` |
# Metasrv

View File

@@ -180,13 +180,18 @@ groups:
- title: Datanode Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{ pod }}]'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-datanode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Datanode CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -197,16 +202,26 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-datanode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Frontend Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-frontend"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Frontend CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -217,16 +232,26 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]-cpu'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-frontend"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Metasrv Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]-resident'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-metasrv"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Metasrv CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -237,16 +262,26 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-metasrv"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Flownode Memory per Instance
type: timeseries
description: Current memory usage by instance
unit: decbytes
unit: bytes
queries:
- expr: sum(process_resident_memory_bytes{}) by (instance, pod)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_memory_limit_in_bytes{app="greptime-flownode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Flownode CPU Usage per Instance
type: timeseries
description: Current cpu usage by instance
@@ -257,6 +292,11 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{ instance }}]-[{{ pod }}]'
- expr: max(greptime_cpu_limit_in_millicores{app="greptime-flownode"})
datasource:
type: prometheus
uid: ${metrics}
legendFormat: limit
- title: Frontend Requests
panels:
- title: HTTP QPS per Instance
@@ -642,6 +682,15 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-AVG'
- title: Cache Miss
type: timeseries
description: The local cache miss of the datanode.
queries:
- expr: sum by (instance,pod, type) (rate(greptime_mito_cache_miss{}[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]'
- title: OpenDAL
panels:
- title: QPS per Instance
@@ -659,41 +708,41 @@ groups:
description: Read QPS per Instance.
unit: ops
queries:
- expr: sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{ operation="read"}[$__rate_interval]))
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{ operation=~"read|Reader::read"}[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: Read P99 per Instance
type: timeseries
description: Read P99 per Instance.
unit: s
queries:
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{operation="read"}[$__rate_interval])))
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{operation=~"read|Reader::read"}[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-{{scheme}}'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: Write QPS per Instance
type: timeseries
description: Write QPS per Instance.
unit: ops
queries:
- expr: sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{ operation="write"}[$__rate_interval]))
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{ operation=~"write|Writer::write|Writer::close"}[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-{{scheme}}'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: Write P99 per Instance
type: timeseries
description: Write P99 per Instance.
unit: s
queries:
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{ operation="write"}[$__rate_interval])))
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{ operation =~ "Writer::write|Writer::close|write"}[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]'
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
- title: List QPS per Instance
type: timeseries
description: List QPS per Instance.
@@ -729,7 +778,7 @@ groups:
description: Other Request P99 per Instance.
unit: s
queries:
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{ operation!~"read|write|list"}[$__rate_interval])))
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{ operation!~"read|write|list|Writer::write|Writer::close|Reader::read"}[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}

View File

@@ -26,7 +26,7 @@ check_dashboards_generation() {
./grafana/scripts/gen-dashboards.sh
if [[ -n "$(git diff --name-only grafana/dashboards/metrics)" ]]; then
echo "Error: The dashboards are not generated correctly. You should execute the `make dashboards` command."
echo "Error: The dashboards are not generated correctly. You should execute the 'make dashboards' command."
exit 1
fi
}

View File

@@ -29,9 +29,11 @@ excludes = [
# enterprise
"src/common/meta/src/rpc/ddl/trigger.rs",
"src/operator/src/expr_helper/trigger.rs",
"src/sql/src/statements/alter/trigger.rs",
"src/sql/src/statements/create/trigger.rs",
"src/sql/src/statements/show/trigger.rs",
"src/sql/src/statements/drop/trigger.rs",
"src/sql/src/parsers/alter_parser/trigger.rs",
"src/sql/src/parsers/create_parser/trigger.rs",
"src/sql/src/parsers/show_parser/trigger.rs",
"src/mito2/src/extension.rs",

View File

@@ -13,8 +13,8 @@
# limitations under the License.
import os
import re
from multiprocessing import Pool
from pathlib import Path
def find_rust_files(directory):
@@ -24,6 +24,10 @@ def find_rust_files(directory):
if "test" in root.lower():
continue
# Skip the target directory
if "target" in Path(root).parts:
continue
for file in files:
# Skip files with "test" in the filename
if "test" in file.lower():

scripts/install.sh Executable file → Normal file
View File

@@ -53,6 +53,54 @@ get_arch_type() {
esac
}
# Verify SHA256 checksum
verify_sha256() {
file="$1"
expected_sha256="$2"
if command -v sha256sum >/dev/null 2>&1; then
actual_sha256=$(sha256sum "$file" | cut -d' ' -f1)
elif command -v shasum >/dev/null 2>&1; then
actual_sha256=$(shasum -a 256 "$file" | cut -d' ' -f1)
else
echo "Warning: No SHA256 verification tool found (sha256sum or shasum). Skipping checksum verification."
return 0
fi
if [ "$actual_sha256" = "$expected_sha256" ]; then
echo "SHA256 checksum verified successfully."
return 0
else
echo "Error: SHA256 checksum verification failed!"
echo "Expected: $expected_sha256"
echo "Actual: $actual_sha256"
return 1
fi
}
# Prompt for user confirmation (compatible with different shells)
prompt_confirmation() {
message="$1"
printf "%s (y/N): " "$message"
# Try to read user input, fallback if read fails
answer=""
if read answer </dev/tty 2>/dev/null; then
case "$answer" in
[Yy]|[Yy][Ee][Ss])
return 0
;;
*)
return 1
;;
esac
else
echo ""
echo "Cannot read user input. Defaulting to No."
return 1
fi
}
download_artifact() {
if [ -n "${OS_TYPE}" ] && [ -n "${ARCH_TYPE}" ]; then
# Use the latest stable released version.
@@ -71,17 +119,104 @@ download_artifact() {
fi
echo "Downloading ${BIN}, OS: ${OS_TYPE}, Arch: ${ARCH_TYPE}, Version: ${VERSION}"
PACKAGE_NAME="${BIN}-${OS_TYPE}-${ARCH_TYPE}-${VERSION}.tar.gz"
PKG_NAME="${BIN}-${OS_TYPE}-${ARCH_TYPE}-${VERSION}"
PACKAGE_NAME="${PKG_NAME}.tar.gz"
SHA256_FILE="${PKG_NAME}.sha256sum"
if [ -n "${PACKAGE_NAME}" ]; then
wget "https://github.com/${GITHUB_ORG}/${GITHUB_REPO}/releases/download/${VERSION}/${PACKAGE_NAME}"
# Check if files already exist and prompt for override
if [ -f "${PACKAGE_NAME}" ]; then
echo "File ${PACKAGE_NAME} already exists."
if prompt_confirmation "Do you want to override it?"; then
echo "Overriding existing file..."
rm -f "${PACKAGE_NAME}"
else
echo "Skipping download. Using existing file."
fi
fi
if [ -f "${BIN}" ]; then
echo "Binary ${BIN} already exists."
if prompt_confirmation "Do you want to override it?"; then
echo "Will override existing binary..."
rm -f "${BIN}"
else
echo "Installation cancelled."
exit 0
fi
fi
# Download package if not exists
if [ ! -f "${PACKAGE_NAME}" ]; then
echo "Downloading ${PACKAGE_NAME}..."
# Use curl instead of wget for better compatibility
if command -v curl >/dev/null 2>&1; then
if ! curl -L -o "${PACKAGE_NAME}" "https://github.com/${GITHUB_ORG}/${GITHUB_REPO}/releases/download/${VERSION}/${PACKAGE_NAME}"; then
echo "Error: Failed to download ${PACKAGE_NAME}"
exit 1
fi
elif command -v wget >/dev/null 2>&1; then
if ! wget -O "${PACKAGE_NAME}" "https://github.com/${GITHUB_ORG}/${GITHUB_REPO}/releases/download/${VERSION}/${PACKAGE_NAME}"; then
echo "Error: Failed to download ${PACKAGE_NAME}"
exit 1
fi
else
echo "Error: Neither curl nor wget is available for downloading."
exit 1
fi
fi
# Download and verify SHA256 checksum
echo "Downloading SHA256 checksum..."
sha256_download_success=0
if command -v curl >/dev/null 2>&1; then
if curl -L -s -o "${SHA256_FILE}" "https://github.com/${GITHUB_ORG}/${GITHUB_REPO}/releases/download/${VERSION}/${SHA256_FILE}" 2>/dev/null; then
sha256_download_success=1
fi
elif command -v wget >/dev/null 2>&1; then
if wget -q -O "${SHA256_FILE}" "https://github.com/${GITHUB_ORG}/${GITHUB_REPO}/releases/download/${VERSION}/${SHA256_FILE}" 2>/dev/null; then
sha256_download_success=1
fi
fi
if [ $sha256_download_success -eq 1 ] && [ -f "${SHA256_FILE}" ]; then
expected_sha256=$(cat "${SHA256_FILE}" | cut -d' ' -f1)
if [ -n "$expected_sha256" ]; then
if ! verify_sha256 "${PACKAGE_NAME}" "${expected_sha256}"; then
echo "SHA256 verification failed. Removing downloaded file."
rm -f "${PACKAGE_NAME}" "${SHA256_FILE}"
exit 1
fi
else
echo "Warning: Could not parse SHA256 checksum from file."
fi
rm -f "${SHA256_FILE}"
else
echo "Warning: Could not download SHA256 checksum file. Skipping verification."
fi
# Extract the binary and clean the rest.
tar xvf "${PACKAGE_NAME}" && \
mv "${PACKAGE_NAME%.tar.gz}/${BIN}" "${PWD}" && \
rm -r "${PACKAGE_NAME}" && \
rm -r "${PACKAGE_NAME%.tar.gz}" && \
echo "Run './${BIN} --help' to get started"
echo "Extracting ${PACKAGE_NAME}..."
if ! tar xf "${PACKAGE_NAME}"; then
echo "Error: Failed to extract ${PACKAGE_NAME}"
exit 1
fi
# Find the binary in the extracted directory
extracted_dir="${PACKAGE_NAME%.tar.gz}"
if [ -f "${extracted_dir}/${BIN}" ]; then
mv "${extracted_dir}/${BIN}" "${PWD}/"
rm -f "${PACKAGE_NAME}"
rm -rf "${extracted_dir}"
chmod +x "${BIN}"
echo "Installation completed successfully!"
echo "Run './${BIN} --help' to get started"
else
echo "Error: Binary ${BIN} not found in extracted archive"
rm -f "${PACKAGE_NAME}"
rm -rf "${extracted_dir}"
exit 1
fi
fi
fi
}

View File

@@ -291,6 +291,7 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
ConcreteDataType::Vector(_) => ColumnDataType::Vector,
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Struct(_)
| ConcreteDataType::Dictionary(_)
| ConcreteDataType::Duration(_) => {
return error::IntoColumnDataTypeSnafu { from: datatype }.fail()
@@ -703,6 +704,7 @@ pub fn pb_values_to_vector_ref(data_type: &ConcreteDataType, values: Values) ->
ConcreteDataType::Vector(_) => Arc::new(BinaryVector::from_vec(values.binary_values)),
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Struct(_)
| ConcreteDataType::Dictionary(_)
| ConcreteDataType::Duration(_)
| ConcreteDataType::Json(_) => {
@@ -864,6 +866,7 @@ pub fn pb_values_to_values(data_type: &ConcreteDataType, values: Values) -> Vec<
ConcreteDataType::Vector(_) => values.binary_values.into_iter().map(|v| v.into()).collect(),
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Struct(_)
| ConcreteDataType::Dictionary(_)
| ConcreteDataType::Duration(_)
| ConcreteDataType::Json(_) => {

View File

@@ -24,7 +24,7 @@ use greptime_proto::v1::{
};
use snafu::ResultExt;
use crate::error::{self, Result};
use crate::error::{self, ConvertColumnDefaultConstraintSnafu, Result};
use crate::helper::ColumnDataTypeWrapper;
use crate::v1::{ColumnDef, ColumnOptions, SemanticType};
@@ -77,6 +77,48 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
})
}
/// Tries to construct a `ColumnDef` from the given `ColumnSchema`.
///
/// TODO(weny): Add tests for this function.
pub fn try_as_column_def(column_schema: &ColumnSchema, is_primary_key: bool) -> Result<ColumnDef> {
let column_datatype =
ColumnDataTypeWrapper::try_from(column_schema.data_type.clone()).map(|w| w.to_parts())?;
let semantic_type = if column_schema.is_time_index() {
SemanticType::Timestamp
} else if is_primary_key {
SemanticType::Tag
} else {
SemanticType::Field
} as i32;
let comment = column_schema
.metadata()
.get(COMMENT_KEY)
.cloned()
.unwrap_or_default();
let default_constraint = match column_schema.default_constraint() {
None => vec![],
Some(v) => v
.clone()
.try_into()
.context(ConvertColumnDefaultConstraintSnafu {
column: &column_schema.name,
})?,
};
let options = options_from_column_schema(column_schema);
Ok(ColumnDef {
name: column_schema.name.clone(),
data_type: column_datatype.0 as i32,
is_nullable: column_schema.is_nullable(),
default_constraint,
semantic_type,
comment,
datatype_extension: column_datatype.1,
options,
})
}
/// Constructs a `ColumnOptions` from the given `ColumnSchema`.
pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<ColumnOptions> {
let mut options = ColumnOptions::default();

View File

@@ -16,8 +16,8 @@ use api::v1::meta::ProcedureStatus;
use common_error::ext::BoxedError;
use common_meta::cluster::{ClusterInfo, NodeInfo};
use common_meta::datanode::RegionStat;
use common_meta::ddl::{ExecutorContext, ProcedureExecutor};
use common_meta::key::flow::flow_state::FlowStat;
use common_meta::procedure_executor::{ExecutorContext, ProcedureExecutor};
use common_meta::rpc::procedure;
use common_procedure::{ProcedureInfo, ProcedureState};
use meta_client::MetaClientRef;

View File

@@ -162,6 +162,16 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
}
fn system_table(&self, name: &str) -> Option<SystemTableRef> {
#[cfg(feature = "enterprise")]
if let Some(factory) = self.extra_table_factories.get(name) {
let req = MakeInformationTableRequest {
catalog_name: self.catalog_name.clone(),
catalog_manager: self.catalog_manager.clone(),
kv_backend: self.kv_backend.clone(),
};
return Some(factory.make_information_table(req));
}
match name.to_ascii_lowercase().as_str() {
TABLES => Some(Arc::new(InformationSchemaTables::new(
self.catalog_name.clone(),
@@ -240,22 +250,7 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
.process_manager
.as_ref()
.map(|p| Arc::new(InformationSchemaProcessList::new(p.clone())) as _),
table_name => {
#[cfg(feature = "enterprise")]
return self.extra_table_factories.get(table_name).map(|factory| {
let req = MakeInformationTableRequest {
catalog_name: self.catalog_name.clone(),
catalog_manager: self.catalog_manager.clone(),
kv_backend: self.kv_backend.clone(),
};
factory.make_information_table(req)
});
#[cfg(not(feature = "enterprise"))]
{
let _ = table_name;
None
}
}
_ => None,
}
}
}

View File

@@ -15,7 +15,8 @@
use std::sync::Arc;
use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE};
use datatypes::schema::{Schema, SchemaRef};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::vectors::{Int64Vector, StringVector, VectorRef};
use crate::system_schema::information_schema::table_names::*;
@@ -367,28 +368,18 @@ pub(super) fn get_schema_columns(table_name: &str) -> (SchemaRef, Vec<VectorRef>
TRIGGERS => (
vec![
string_column("TRIGGER_CATALOG"),
string_column("TRIGGER_SCHEMA"),
string_column("TRIGGER_NAME"),
string_column("EVENT_MANIPULATION"),
string_column("EVENT_OBJECT_CATALOG"),
string_column("EVENT_OBJECT_SCHEMA"),
string_column("EVENT_OBJECT_TABLE"),
bigint_column("ACTION_ORDER"),
string_column("ACTION_CONDITION"),
string_column("ACTION_STATEMENT"),
string_column("ACTION_ORIENTATION"),
string_column("ACTION_TIMING"),
string_column("ACTION_REFERENCE_OLD_TABLE"),
string_column("ACTION_REFERENCE_NEW_TABLE"),
string_column("ACTION_REFERENCE_OLD_ROW"),
string_column("ACTION_REFERENCE_NEW_ROW"),
timestamp_micro_column("CREATED"),
string_column("SQL_MODE"),
string_column("DEFINER"),
string_column("CHARACTER_SET_CLIENT"),
string_column("COLLATION_CONNECTION"),
string_column("DATABASE_COLLATION"),
ColumnSchema::new(
"trigger_id",
ConcreteDataType::uint64_datatype(),
false,
),
string_column("TRIGGER_DEFINITION"),
ColumnSchema::new(
"flownode_id",
ConcreteDataType::uint64_datatype(),
true,
),
],
vec![],
),

View File

@@ -329,13 +329,8 @@ impl InformationSchemaPartitionsBuilder {
self.partition_names.push(Some(&partition_name));
self.partition_ordinal_positions
.push(Some((index + 1) as i64));
let expressions = if partition.partition.partition_columns().is_empty() {
None
} else {
Some(partition.partition.to_string())
};
self.partition_expressions.push(expressions.as_deref());
let expression = partition.partition_expr.as_ref().map(|e| e.to_string());
self.partition_expressions.push(expression.as_deref());
self.create_times.push(Some(TimestampMicrosecond::from(
table_info.meta.created_on.timestamp_millis(),
)));

View File

@@ -44,6 +44,7 @@ const DISK_SIZE: &str = "disk_size";
const MEMTABLE_SIZE: &str = "memtable_size";
const MANIFEST_SIZE: &str = "manifest_size";
const SST_SIZE: &str = "sst_size";
const SST_NUM: &str = "sst_num";
const INDEX_SIZE: &str = "index_size";
const ENGINE: &str = "engine";
const REGION_ROLE: &str = "region_role";
@@ -87,6 +88,7 @@ impl InformationSchemaRegionStatistics {
ColumnSchema::new(MEMTABLE_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(MANIFEST_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(SST_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(SST_NUM, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(INDEX_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(ENGINE, ConcreteDataType::string_datatype(), true),
ColumnSchema::new(REGION_ROLE, ConcreteDataType::string_datatype(), true),
@@ -149,6 +151,7 @@ struct InformationSchemaRegionStatisticsBuilder {
memtable_sizes: UInt64VectorBuilder,
manifest_sizes: UInt64VectorBuilder,
sst_sizes: UInt64VectorBuilder,
sst_nums: UInt64VectorBuilder,
index_sizes: UInt64VectorBuilder,
engines: StringVectorBuilder,
region_roles: StringVectorBuilder,
@@ -167,6 +170,7 @@ impl InformationSchemaRegionStatisticsBuilder {
memtable_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
manifest_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
sst_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
sst_nums: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
index_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
engines: StringVectorBuilder::with_capacity(INIT_CAPACITY),
region_roles: StringVectorBuilder::with_capacity(INIT_CAPACITY),
@@ -197,6 +201,7 @@ impl InformationSchemaRegionStatisticsBuilder {
(MEMTABLE_SIZE, &Value::from(region_stat.memtable_size)),
(MANIFEST_SIZE, &Value::from(region_stat.manifest_size)),
(SST_SIZE, &Value::from(region_stat.sst_size)),
(SST_NUM, &Value::from(region_stat.sst_num)),
(INDEX_SIZE, &Value::from(region_stat.index_size)),
(ENGINE, &Value::from(region_stat.engine.as_str())),
(REGION_ROLE, &Value::from(region_stat.role.to_string())),
@@ -215,6 +220,7 @@ impl InformationSchemaRegionStatisticsBuilder {
self.memtable_sizes.push(Some(region_stat.memtable_size));
self.manifest_sizes.push(Some(region_stat.manifest_size));
self.sst_sizes.push(Some(region_stat.sst_size));
self.sst_nums.push(Some(region_stat.sst_num));
self.index_sizes.push(Some(region_stat.index_size));
self.engines.push(Some(&region_stat.engine));
self.region_roles.push(Some(&region_stat.role.to_string()));
@@ -230,6 +236,7 @@ impl InformationSchemaRegionStatisticsBuilder {
Arc::new(self.memtable_sizes.finish()),
Arc::new(self.manifest_sizes.finish()),
Arc::new(self.sst_sizes.finish()),
Arc::new(self.sst_nums.finish()),
Arc::new(self.index_sizes.finish()),
Arc::new(self.engines.finish()),
Arc::new(self.region_roles.finish()),

View File

@@ -48,4 +48,3 @@ pub const FLOWS: &str = "flows";
pub const PROCEDURE_INFO: &str = "procedure_info";
pub const REGION_STATISTICS: &str = "region_statistics";
pub const PROCESS_LIST: &str = "process_list";
pub const TRIGGER_LIST: &str = "trigger_list";

View File

@@ -169,7 +169,7 @@ impl DfPartitionStream for PGClass {
}
/// Builds the `pg_catalog.pg_class` table row by row
/// TODO(J0HN50N133): `relowner` is always the [`DUMMY_OWNER_ID`] cuz we don't have user.
/// TODO(J0HN50N133): `relowner` is always the [`DUMMY_OWNER_ID`] because we don't have users.
/// Once we have user system, make it the actual owner of the table.
struct PGClassBuilder {
schema: SchemaRef,

View File

@@ -70,6 +70,11 @@ impl DfTableSourceProvider {
}
}
/// Returns the query context.
pub fn query_ctx(&self) -> &QueryContextRef {
&self.query_ctx
}
pub fn resolve_table_ref(&self, table_ref: TableReference) -> Result<ResolvedTableReference> {
if self.disallow_cross_catalog_query {
match &table_ref {

View File

@@ -188,6 +188,7 @@ fn create_region_routes(regions: Vec<RegionNumber>) -> Vec<RegionRoute> {
name: String::new(),
partition: None,
attrs: BTreeMap::new(),
partition_expr: Default::default(),
},
leader_peer: Some(Peer {
id: rng.random_range(0..10),

View File

@@ -75,7 +75,7 @@ impl StoreConfig {
#[cfg(feature = "pg_kvbackend")]
BackendImpl::PostgresStore => {
let table_name = &self.meta_table_name;
let pool = meta_srv::bootstrap::create_postgres_pool(store_addrs)
let pool = meta_srv::bootstrap::create_postgres_pool(store_addrs, None)
.await
.map_err(BoxedError::new)?;
Ok(common_meta::kv_backend::rds::PgStore::with_pg_pool(

View File

@@ -241,7 +241,6 @@ impl RepairTool {
let alter_table_request = alter_table::make_alter_region_request_for_peer(
logical_table_id,
&alter_table_expr,
full_table_metadata.table_info.ident.version,
peer,
physical_region_routes,
)?;

View File

@@ -66,7 +66,6 @@ pub fn generate_alter_table_expr_for_all_columns(
pub fn make_alter_region_request_for_peer(
logical_table_id: TableId,
alter_table_expr: &AlterTableExpr,
schema_version: u64,
peer: &Peer,
region_routes: &[RegionRoute],
) -> Result<RegionRequest> {
@@ -74,7 +73,7 @@ pub fn make_alter_region_request_for_peer(
let mut requests = Vec::with_capacity(regions_on_this_peer.len());
for region_number in &regions_on_this_peer {
let region_id = RegionId::new(logical_table_id, *region_number);
let request = make_alter_region_request(region_id, alter_table_expr, schema_version);
let request = make_alter_region_request(region_id, alter_table_expr);
requests.push(request);
}

View File

@@ -23,7 +23,7 @@ use api::v1::greptime_request::Request;
use api::v1::query_request::Query;
use api::v1::{
AlterTableExpr, AuthHeader, Basic, CreateTableExpr, DdlRequest, GreptimeRequest,
InsertRequests, QueryRequest, RequestHeader,
InsertRequests, QueryRequest, RequestHeader, RowInsertRequests,
};
use arrow_flight::{FlightData, Ticket};
use async_stream::stream;
@@ -42,7 +42,7 @@ use common_telemetry::{error, warn};
use futures::future;
use futures_util::{Stream, StreamExt, TryStreamExt};
use prost::Message;
use snafu::{ensure, ResultExt};
use snafu::{ensure, OptionExt, ResultExt};
use tonic::metadata::{AsciiMetadataKey, AsciiMetadataValue, MetadataMap, MetadataValue};
use tonic::transport::Channel;
@@ -118,6 +118,7 @@ impl Database {
}
}
/// Set the catalog for the database client.
pub fn set_catalog(&mut self, catalog: impl Into<String>) {
self.catalog = catalog.into();
}
@@ -130,6 +131,7 @@ impl Database {
}
}
/// Set the schema for the database client.
pub fn set_schema(&mut self, schema: impl Into<String>) {
self.schema = schema.into();
}
@@ -142,20 +144,24 @@ impl Database {
}
}
/// Set the timezone for the database client.
pub fn set_timezone(&mut self, timezone: impl Into<String>) {
self.timezone = timezone.into();
}
/// Set the auth scheme for the database client.
pub fn set_auth(&mut self, auth: AuthScheme) {
self.ctx.auth_header = Some(AuthHeader {
auth_scheme: Some(auth),
});
}
/// Make an InsertRequests request to the database.
pub async fn insert(&self, requests: InsertRequests) -> Result<u32> {
self.handle(Request::Inserts(requests)).await
}
/// Make an InsertRequests request to the database with hints.
pub async fn insert_with_hints(
&self,
requests: InsertRequests,
@@ -172,6 +178,28 @@ impl Database {
from_grpc_response(response)
}
/// Make a RowInsertRequests request to the database.
pub async fn row_inserts(&self, requests: RowInsertRequests) -> Result<u32> {
self.handle(Request::RowInserts(requests)).await
}
/// Make a RowInsertRequests request to the database with hints.
pub async fn row_inserts_with_hints(
&self,
requests: RowInsertRequests,
hints: &[(&str, &str)],
) -> Result<u32> {
let mut client = make_database_client(&self.client)?.inner;
let request = self.to_rpc_request(Request::RowInserts(requests));
let mut request = tonic::Request::new(request);
let metadata = request.metadata_mut();
Self::put_hints(metadata, hints)?;
let response = client.handle(request).await?.into_inner();
from_grpc_response(response)
}
fn put_hints(metadata: &mut MetadataMap, hints: &[(&str, &str)]) -> Result<()> {
let Some(value) = hints
.iter()
@@ -187,6 +215,7 @@ impl Database {
Ok(())
}
/// Make a request to the database.
pub async fn handle(&self, request: Request) -> Result<u32> {
let mut client = make_database_client(&self.client)?.inner;
let request = self.to_rpc_request(request);
@@ -221,12 +250,18 @@ impl Database {
retries += 1;
warn!("Retrying {} times with error = {:?}", retries, err);
continue;
} else {
error!(
err; "Failed to send request to grpc handle, retries = {}, not retryable error, aborting",
retries
);
return Err(err.into());
}
}
(Err(err), false) => {
error!(
"Failed to send request to grpc handle after {} retries, error = {:?}",
retries, err
err; "Failed to send request to grpc handle after {} retries",
retries,
);
return Err(err.into());
}
@@ -250,6 +285,7 @@ impl Database {
}
}
/// Executes a SQL query without any hints.
pub async fn sql<S>(&self, sql: S) -> Result<Output>
where
S: AsRef<str>,
@@ -257,6 +293,7 @@ impl Database {
self.sql_with_hint(sql, &[]).await
}
/// Executes a SQL query with optional hints for query optimization.
pub async fn sql_with_hint<S>(&self, sql: S, hints: &[(&str, &str)]) -> Result<Output>
where
S: AsRef<str>,
@@ -267,6 +304,7 @@ impl Database {
self.do_get(request, hints).await
}
/// Executes a logical plan directly without SQL parsing.
pub async fn logical_plan(&self, logical_plan: Vec<u8>) -> Result<Output> {
let request = Request::Query(QueryRequest {
query: Some(Query::LogicalPlan(logical_plan)),
@@ -274,6 +312,7 @@ impl Database {
self.do_get(request, &[]).await
}
/// Creates a new table using the provided table expression.
pub async fn create(&self, expr: CreateTableExpr) -> Result<Output> {
let request = Request::Ddl(DdlRequest {
expr: Some(DdlExpr::CreateTable(expr)),
@@ -281,6 +320,7 @@ impl Database {
self.do_get(request, &[]).await
}
/// Alters an existing table using the provided alter expression.
pub async fn alter(&self, expr: AlterTableExpr) -> Result<Output> {
let request = Request::Ddl(DdlRequest {
expr: Some(DdlExpr::AlterTable(expr)),
@@ -321,7 +361,10 @@ impl Database {
let mut flight_message_stream = flight_data_stream.map(move |flight_data| {
flight_data
.map_err(Error::from)
.and_then(|data| decoder.try_decode(&data).context(ConvertFlightDataSnafu))
.and_then(|data| decoder.try_decode(&data).context(ConvertFlightDataSnafu))?
.context(IllegalFlightMessagesSnafu {
reason: "none message",
})
});
let Some(first_flight_message) = flight_message_stream.next().await else {

View File

@@ -128,7 +128,10 @@ impl RegionRequester {
let mut flight_message_stream = flight_data_stream.map(move |flight_data| {
flight_data
.map_err(Error::from)
.and_then(|data| decoder.try_decode(&data).context(ConvertFlightDataSnafu))
.and_then(|data| decoder.try_decode(&data).context(ConvertFlightDataSnafu))?
.context(IllegalFlightMessagesSnafu {
reason: "none message",
})
});
let Some(first_flight_message) = flight_message_stream.next().await else {
@@ -157,19 +160,70 @@ impl RegionRequester {
let _span = tracing_context.attach(common_telemetry::tracing::info_span!(
"poll_flight_data_stream"
));
while let Some(flight_message) = flight_message_stream.next().await {
let flight_message = flight_message
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let mut buffered_message: Option<FlightMessage> = None;
let mut stream_ended = false;
while !stream_ended {
// Get the next message from the buffer, or read it from the flight message stream.
let flight_message_item = if let Some(msg) = buffered_message.take() {
Some(Ok(msg))
} else {
flight_message_stream.next().await
};
let flight_message = match flight_message_item {
Some(Ok(message)) => message,
Some(Err(e)) => {
yield Err(BoxedError::new(e)).context(ExternalSnafu);
break;
}
None => break,
};
match flight_message {
FlightMessage::RecordBatch(record_batch) => {
yield RecordBatch::try_from_df_record_batch(
let result_to_yield = RecordBatch::try_from_df_record_batch(
schema_cloned.clone(),
record_batch,
)
);
// Get the next message from the stream; normally it should be a metrics message.
if let Some(next_flight_message_result) = flight_message_stream.next().await
{
match next_flight_message_result {
Ok(FlightMessage::Metrics(s)) => {
let m = serde_json::from_str(&s).ok().map(Arc::new);
metrics_ref.swap(m);
}
Ok(FlightMessage::RecordBatch(rb)) => {
// for some reason it's not a metrics message, so we need to buffer this record batch
// and yield it in the next iteration.
buffered_message = Some(FlightMessage::RecordBatch(rb));
}
Ok(_) => {
yield IllegalFlightMessagesSnafu {
reason: "A RecordBatch message can only be succeeded by a Metrics message or another RecordBatch message"
}
.fail()
.map_err(BoxedError::new)
.context(ExternalSnafu);
break;
}
Err(e) => {
yield Err(BoxedError::new(e)).context(ExternalSnafu);
break;
}
}
} else {
// the stream has ended
stream_ended = true;
}
yield result_to_yield;
}
FlightMessage::Metrics(s) => {
// Just a fallback branch in case a metrics message arrives after other messages.
let m = serde_json::from_str(&s).ok().map(Arc::new);
metrics_ref.swap(m);
break;

View File

@@ -38,6 +38,7 @@ common-config.workspace = true
common-error.workspace = true
common-grpc.workspace = true
common-macro.workspace = true
common-mem-prof.workspace = true
common-meta.workspace = true
common-options.workspace = true
common-procedure.workspace = true

View File

@@ -20,11 +20,11 @@ use cmd::error::{InitTlsProviderSnafu, Result};
use cmd::options::GlobalOptions;
use cmd::{cli, datanode, flownode, frontend, metasrv, standalone, App};
use common_base::Plugins;
use common_version::version;
use common_version::{verbose_version, version};
use servers::install_ring_crypto_provider;
#[derive(Parser)]
#[command(name = "greptime", author, version, long_version = version(), about)]
#[command(name = "greptime", author, version, long_version = verbose_version(), about)]
#[command(propagate_version = true)]
pub(crate) struct Command {
#[clap(subcommand)]
@@ -143,10 +143,8 @@ async fn start(cli: Command) -> Result<()> {
}
fn setup_human_panic() {
human_panic::setup_panic!(
human_panic::Metadata::new("GreptimeDB", env!("CARGO_PKG_VERSION"))
.homepage("https://github.com/GreptimeTeam/greptimedb/discussions")
);
human_panic::setup_panic!(human_panic::Metadata::new("GreptimeDB", version())
.homepage("https://github.com/GreptimeTeam/greptimedb/discussions"));
common_telemetry::set_panic_hook();
}

View File

@@ -19,7 +19,7 @@ use catalog::kvbackend::MetaKvBackend;
use common_base::Plugins;
use common_meta::cache::LayeredCacheRegistryBuilder;
use common_telemetry::info;
use common_version::{short_version, version};
use common_version::{short_version, verbose_version};
use datanode::datanode::DatanodeBuilder;
use datanode::service::DatanodeServiceBuilder;
use meta_client::MetaClientType;
@@ -28,7 +28,7 @@ use tracing_appender::non_blocking::WorkerGuard;
use crate::datanode::{DatanodeOptions, Instance, APP_NAME};
use crate::error::{MetaClientInitSnafu, MissingConfigSnafu, Result, StartDatanodeSnafu};
use crate::{create_resource_limit_metrics, log_versions};
use crate::{create_resource_limit_metrics, log_versions, maybe_activate_heap_profile};
/// Builder for Datanode instance.
pub struct InstanceBuilder {
@@ -67,7 +67,8 @@ impl InstanceBuilder {
None,
);
log_versions(version(), short_version(), APP_NAME);
log_versions(verbose_version(), short_version(), APP_NAME);
maybe_activate_heap_profile(&dn_opts.memory);
create_resource_limit_metrics(APP_NAME);
plugins::setup_datanode_plugins(plugins, &opts.plugins, dn_opts)

View File

@@ -32,7 +32,7 @@ use common_meta::key::flow::FlowMetadataManager;
use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::{TracingOptions, DEFAULT_LOGGING_DIR};
use common_version::{short_version, version};
use common_version::{short_version, verbose_version};
use flow::{
get_flow_auth_options, FlownodeBuilder, FlownodeInstance, FlownodeServiceBuilder,
FrontendClient, FrontendInvoker,
@@ -46,7 +46,7 @@ use crate::error::{
MissingConfigSnafu, Result, ShutdownFlownodeSnafu, StartFlownodeSnafu,
};
use crate::options::{GlobalOptions, GreptimeOptions};
use crate::{create_resource_limit_metrics, log_versions, App};
use crate::{create_resource_limit_metrics, log_versions, maybe_activate_heap_profile, App};
pub const APP_NAME: &str = "greptime-flownode";
@@ -279,7 +279,8 @@ impl StartCommand {
None,
);
log_versions(version(), short_version(), APP_NAME);
log_versions(verbose_version(), short_version(), APP_NAME);
maybe_activate_heap_profile(&opts.component.memory);
create_resource_limit_metrics(APP_NAME);
info!("Flownode start command: {:#?}", self);
@@ -374,6 +375,7 @@ impl StartCommand {
meta_client.clone(),
flow_auth_header,
opts.query.clone(),
opts.flow.batching_mode.clone(),
);
let frontend_client = Arc::new(frontend_client);
let flownode_builder = FlownodeBuilder::new(

View File

@@ -33,7 +33,7 @@ use common_meta::heartbeat::handler::HandlerGroupExecutor;
use common_telemetry::info;
use common_telemetry::logging::{TracingOptions, DEFAULT_LOGGING_DIR};
use common_time::timezone::set_default_timezone;
use common_version::{short_version, version};
use common_version::{short_version, verbose_version};
use frontend::frontend::Frontend;
use frontend::heartbeat::HeartbeatTask;
use frontend::instance::builder::FrontendBuilder;
@@ -47,7 +47,7 @@ use tracing_appender::non_blocking::WorkerGuard;
use crate::error::{self, Result};
use crate::options::{GlobalOptions, GreptimeOptions};
use crate::{create_resource_limit_metrics, log_versions, App};
use crate::{create_resource_limit_metrics, log_versions, maybe_activate_heap_profile, App};
type FrontendOptions = GreptimeOptions<frontend::frontend::FrontendOptions>;
@@ -282,7 +282,8 @@ impl StartCommand {
opts.component.slow_query.as_ref(),
);
log_versions(version(), short_version(), APP_NAME);
log_versions(verbose_version(), short_version(), APP_NAME);
maybe_activate_heap_profile(&opts.component.memory);
create_resource_limit_metrics(APP_NAME);
info!("Frontend start command: {:#?}", self);

View File

@@ -15,7 +15,10 @@
#![feature(assert_matches, let_chains)]
use async_trait::async_trait;
use common_telemetry::{error, info};
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_mem_prof::activate_heap_profile;
use common_telemetry::{error, info, warn};
use stat::{get_cpu_limit, get_memory_limit};
use crate::error::Result;
@@ -112,7 +115,7 @@ pub trait App: Send {
pub fn log_versions(version: &str, short_version: &str, app: &str) {
// Report app version as gauge.
APP_VERSION
.with_label_values(&[env!("CARGO_PKG_VERSION"), short_version, app])
.with_label_values(&[common_version::version(), short_version, app])
.inc();
// Log version and argument flags.
@@ -145,3 +148,20 @@ fn log_env_flags() {
info!("argument: {}", argument);
}
}
pub fn maybe_activate_heap_profile(memory_options: &common_options::memory::MemoryOptions) {
if memory_options.enable_heap_profiling {
match activate_heap_profile() {
Ok(()) => {
info!("Heap profile is active");
}
Err(err) => {
if err.status_code() == StatusCode::Unsupported {
info!("Heap profile is not supported");
} else {
warn!(err; "Failed to activate heap profile");
}
}
}
}
}
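// A hedged sketch (not part of this diff): activating heap profiling from options. Only
// `enable_heap_profiling` appears in this diff; relying on `Default` for the remaining
// `MemoryOptions` fields is an assumption.
//
// let memory_options = common_options::memory::MemoryOptions {
//     enable_heap_profiling: true,
//     ..Default::default()
// };
// maybe_activate_heap_profile(&memory_options);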

View File

@@ -22,7 +22,7 @@ use common_base::Plugins;
use common_config::Configurable;
use common_telemetry::info;
use common_telemetry::logging::{TracingOptions, DEFAULT_LOGGING_DIR};
use common_version::{short_version, version};
use common_version::{short_version, verbose_version};
use meta_srv::bootstrap::MetasrvInstance;
use meta_srv::metasrv::BackendImpl;
use snafu::ResultExt;
@@ -30,7 +30,7 @@ use tracing_appender::non_blocking::WorkerGuard;
use crate::error::{self, LoadLayeredConfigSnafu, Result, StartMetaServerSnafu};
use crate::options::{GlobalOptions, GreptimeOptions};
use crate::{create_resource_limit_metrics, log_versions, App};
use crate::{create_resource_limit_metrics, log_versions, maybe_activate_heap_profile, App};
type MetasrvOptions = GreptimeOptions<meta_srv::metasrv::MetasrvOptions>;
@@ -324,7 +324,8 @@ impl StartCommand {
None,
);
log_versions(version(), short_version(), APP_NAME);
log_versions(verbose_version(), short_version(), APP_NAME);
maybe_activate_heap_profile(&opts.component.memory);
create_resource_limit_metrics(APP_NAME);
info!("Metasrv start command: {:#?}", self);
@@ -340,12 +341,12 @@ impl StartCommand {
.await
.context(StartMetaServerSnafu)?;
let builder = meta_srv::bootstrap::metasrv_builder(&opts, plugins.clone(), None)
let builder = meta_srv::bootstrap::metasrv_builder(&opts, plugins, None)
.await
.context(error::BuildMetaServerSnafu)?;
let metasrv = builder.build().await.context(error::BuildMetaServerSnafu)?;
let instance = MetasrvInstance::new(opts, plugins, metasrv)
let instance = MetasrvInstance::new(metasrv)
.await
.context(error::BuildMetaServerSnafu)?;

View File

@@ -34,24 +34,26 @@ use common_meta::cluster::{NodeInfo, NodeStatus};
use common_meta::datanode::RegionStat;
use common_meta::ddl::flow_meta::FlowMetadataAllocator;
use common_meta::ddl::table_meta::TableMetadataAllocator;
use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl, ProcedureExecutorRef};
use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl};
use common_meta::ddl_manager::DdlManager;
use common_meta::key::flow::flow_state::FlowStat;
use common_meta::key::flow::FlowMetadataManager;
use common_meta::key::{TableMetadataManager, TableMetadataManagerRef};
use common_meta::kv_backend::KvBackendRef;
use common_meta::peer::Peer;
use common_meta::procedure_executor::LocalProcedureExecutor;
use common_meta::region_keeper::MemoryRegionKeeper;
use common_meta::region_registry::LeaderRegionRegistry;
use common_meta::sequence::SequenceBuilder;
use common_meta::wal_options_allocator::{build_wal_options_allocator, WalOptionsAllocatorRef};
use common_options::memory::MemoryOptions;
use common_procedure::{ProcedureInfo, ProcedureManagerRef};
use common_telemetry::info;
use common_telemetry::logging::{
LoggingOptions, SlowQueryOptions, TracingOptions, DEFAULT_LOGGING_DIR,
};
use common_time::timezone::set_default_timezone;
use common_version::{short_version, version};
use common_version::{short_version, verbose_version};
use common_wal::config::DatanodeWalConfig;
use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, StorageConfig};
use datanode::datanode::{Datanode, DatanodeBuilder};
@@ -83,7 +85,7 @@ use tracing_appender::non_blocking::WorkerGuard;
use crate::error::{Result, StartFlownodeSnafu};
use crate::options::{GlobalOptions, GreptimeOptions};
use crate::{create_resource_limit_metrics, error, log_versions, App};
use crate::{create_resource_limit_metrics, error, log_versions, maybe_activate_heap_profile, App};
pub const APP_NAME: &str = "greptime-standalone";
@@ -157,6 +159,7 @@ pub struct StandaloneOptions {
pub max_in_flight_write_bytes: Option<ReadableSize>,
pub slow_query: Option<SlowQueryOptions>,
pub query: QueryOptions,
pub memory: MemoryOptions,
}
impl Default for StandaloneOptions {
@@ -190,6 +193,7 @@ impl Default for StandaloneOptions {
max_in_flight_write_bytes: None,
slow_query: Some(SlowQueryOptions::default()),
query: QueryOptions::default(),
memory: MemoryOptions::default(),
}
}
}
@@ -485,7 +489,8 @@ impl StartCommand {
opts.component.slow_query.as_ref(),
);
log_versions(version(), short_version(), APP_NAME);
log_versions(verbose_version(), short_version(), APP_NAME);
maybe_activate_heap_profile(&opts.component.memory);
create_resource_limit_metrics(APP_NAME);
info!("Standalone start command: {:#?}", self);
@@ -636,9 +641,8 @@ impl StartCommand {
flow_metadata_allocator: flow_metadata_allocator.clone(),
region_failure_detector_controller: Arc::new(NoopRegionFailureDetectorControl),
};
let procedure_manager_c = procedure_manager.clone();
let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager_c, true)
let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager.clone(), true)
.context(error::InitDdlManagerSnafu)?;
#[cfg(feature = "enterprise")]
let ddl_manager = {
@@ -646,7 +650,11 @@ impl StartCommand {
plugins.get();
ddl_manager.with_trigger_ddl_manager(trigger_ddl_manager)
};
let ddl_task_executor: ProcedureExecutorRef = Arc::new(ddl_manager);
let procedure_executor = Arc::new(LocalProcedureExecutor::new(
Arc::new(ddl_manager),
procedure_manager.clone(),
));
let fe_instance = FrontendBuilder::new(
fe_opts.clone(),
@@ -654,7 +662,7 @@ impl StartCommand {
layered_cache_registry.clone(),
catalog_manager.clone(),
node_manager.clone(),
ddl_task_executor.clone(),
procedure_executor.clone(),
process_manager,
)
.with_plugin(plugins.clone())
@@ -679,7 +687,7 @@ impl StartCommand {
catalog_manager.clone(),
kv_backend.clone(),
layered_cache_registry.clone(),
ddl_task_executor.clone(),
procedure_executor,
node_manager,
)
.await
@@ -821,6 +829,7 @@ impl InformationExtension for StandaloneInformationExtension {
memtable_size: region_stat.memtable_size,
manifest_size: region_stat.manifest_size,
sst_size: region_stat.sst_size,
sst_num: region_stat.sst_num,
index_size: region_stat.index_size,
region_manifest: region_stat.manifest.into(),
data_topic_latest_entry_id: region_stat.data_topic_latest_entry_id,

View File

@@ -34,6 +34,7 @@ use query::options::QueryOptions;
use servers::export_metrics::ExportMetricsOption;
use servers::grpc::GrpcOptions;
use servers::http::HttpOptions;
use servers::tls::{TlsMode, TlsOption};
use store_api::path_utils::WAL_DIR;
#[allow(deprecated)]
@@ -190,6 +191,13 @@ fn test_load_metasrv_example_config() {
remote_write: Some(Default::default()),
..Default::default()
},
backend_tls: Some(TlsOption {
mode: TlsMode::Prefer,
cert_path: String::new(),
key_path: String::new(),
ca_cert_path: String::new(),
watch: false,
}),
..Default::default()
},
..Default::default()
@@ -225,7 +233,10 @@ fn test_load_flownode_example_config() {
heartbeat: Default::default(),
// flownode deliberately uses a slower query parallelism
// to avoid overwhelming the frontend with too many queries
query: QueryOptions { parallelism: 1 },
query: QueryOptions {
parallelism: 1,
allow_query_fallback: false,
},
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
@@ -242,6 +253,7 @@ fn test_load_flownode_example_config() {
..Default::default()
},
user_provider: None,
memory: Default::default(),
},
..Default::default()
};
@@ -295,6 +307,7 @@ fn test_load_standalone_example_config() {
cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default()
},
..Default::default()
},
..Default::default()

View File

@@ -104,8 +104,6 @@ pub const INFORMATION_SCHEMA_PROCEDURE_INFO_TABLE_ID: u32 = 34;
pub const INFORMATION_SCHEMA_REGION_STATISTICS_TABLE_ID: u32 = 35;
/// id for information_schema.process_list
pub const INFORMATION_SCHEMA_PROCESS_LIST_TABLE_ID: u32 = 36;
/// id for information_schema.trigger_list (for greptimedb trigger)
pub const INFORMATION_SCHEMA_TRIGGER_TABLE_ID: u32 = 37;
// ----- End of information_schema tables -----

View File

@@ -21,6 +21,7 @@ pub mod error;
pub mod file_format;
pub mod lister;
pub mod object_store;
pub mod parquet_writer;
pub mod share_buffer;
#[cfg(test)]
pub mod test_util;

View File

@@ -77,6 +77,11 @@ pub fn build_oss_backend(
let op = ObjectStore::new(builder)
.context(error::BuildBackendSnafu)?
.layer(
object_store::layers::RetryLayer::new()
.with_jitter()
.with_notify(object_store::util::PrintDetailedError),
)
.layer(object_store::layers::LoggingLayer::default())
.layer(object_store::layers::TracingLayer)
.layer(object_store::layers::build_prometheus_metrics_layer(true))

View File

@@ -85,6 +85,11 @@ pub fn build_s3_backend(
// TODO(weny): Consider finding a better way to eliminate duplicate code.
Ok(ObjectStore::new(builder)
.context(error::BuildBackendSnafu)?
.layer(
object_store::layers::RetryLayer::new()
.with_jitter()
.with_notify(object_store::util::PrintDetailedError),
)
.layer(object_store::layers::LoggingLayer::new(
DefaultLoggingInterceptor,
))

View File

@@ -0,0 +1,52 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use bytes::Bytes;
use futures::future::BoxFuture;
use object_store::Writer;
use parquet::arrow::async_writer::AsyncFileWriter;
use parquet::errors::ParquetError;
/// Bridges opendal [Writer] with parquet [AsyncFileWriter].
pub struct AsyncWriter {
inner: Writer,
}
impl AsyncWriter {
/// Create a [`AsyncWriter`] by given [`Writer`].
pub fn new(writer: Writer) -> Self {
Self { inner: writer }
}
}
impl AsyncFileWriter for AsyncWriter {
fn write(&mut self, bs: Bytes) -> BoxFuture<'_, parquet::errors::Result<()>> {
Box::pin(async move {
self.inner
.write(bs)
.await
.map_err(|err| ParquetError::External(Box::new(err)))
})
}
fn complete(&mut self) -> BoxFuture<'_, parquet::errors::Result<()>> {
Box::pin(async move {
self.inner
.close()
.await
.map(|_| ())
.map_err(|err| ParquetError::External(Box::new(err)))
})
}
}
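// A hedged usage sketch (not part of this diff): bridging an opendal writer into parquet's
// async Arrow writer. The `AsyncArrowWriter` API and the `store`/`schema`/`batch` bindings
// are assumptions based on recent parquet/opendal releases, not taken from this repository.
//
// let writer = store.writer("data/example.parquet").await?;
// let mut parquet_writer =
//     parquet::arrow::AsyncArrowWriter::try_new(AsyncWriter::new(writer), schema, None)?;
// parquet_writer.write(&batch).await?;
// parquet_writer.close().await?;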

View File

@@ -84,24 +84,33 @@ fn test_to_string() {
assert_eq!(result.unwrap_err().to_string(), "<root cause>");
}
fn normalize_path(s: &str) -> String {
s.replace('\\', "/")
}
#[test]
fn test_debug_format() {
let result = normal_error();
let debug_output = format!("{:?}", result.unwrap_err());
let normalized_output = debug_output.replace('\\', "/");
assert_eq!(
normalized_output,
r#"0: A normal error with "display" attribute, message "blabla", at src/common/error/tests/ext.rs:55:22
1: PlainError { msg: "<root cause>", status_code: Unexpected }"#
normalize_path(&debug_output),
format!(
r#"0: A normal error with "display" attribute, message "blabla", at {}:55:22
1: PlainError {{ msg: "<root cause>", status_code: Unexpected }}"#,
normalize_path(file!())
)
);
let result = transparent_error();
let debug_output = format!("{:?}", result.unwrap_err());
let normalized_output = debug_output.replace('\\', "/");
assert_eq!(
normalized_output,
r#"0: <transparent>, at src/common/error/tests/ext.rs:60:5
1: PlainError { msg: "<root cause>", status_code: Unexpected }"#
normalize_path(&debug_output),
format!(
r#"0: <transparent>, at {}:60:5
1: PlainError {{ msg: "<root cause>", status_code: Unexpected }}"#,
normalize_path(file!())
)
);
}

View File

@@ -0,0 +1,25 @@
[package]
name = "common-event-recorder"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
api.workspace = true
async-trait.workspace = true
backon.workspace = true
client.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-meta.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true
store-api.workspace = true
tokio.workspace = true
tokio-util.workspace = true
[lints]
workspace = true

View File

@@ -0,0 +1,53 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::ColumnSchema;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("No available frontend"))]
NoAvailableFrontend {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Mismatched schema, expected: {:?}, actual: {:?}", expected, actual))]
MismatchedSchema {
#[snafu(implicit)]
location: Location,
expected: Vec<ColumnSchema>,
actual: Vec<ColumnSchema>,
},
}
pub type Result<T> = std::result::Result<T, Error>;
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Error::MismatchedSchema { .. } => StatusCode::InvalidArguments,
Error::NoAvailableFrontend { .. } => StatusCode::Internal,
}
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}

View File

@@ -0,0 +1,18 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod error;
pub mod recorder;
pub use recorder::*;

View File

@@ -0,0 +1,527 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use std::fmt::Debug;
use std::sync::{Arc, OnceLock};
use std::time::Duration;
use api::v1::column_data_type_extension::TypeExt;
use api::v1::value::ValueData;
use api::v1::{
ColumnDataType, ColumnDataTypeExtension, ColumnSchema, JsonTypeExtension, Row,
RowInsertRequest, RowInsertRequests, Rows, SemanticType,
};
use async_trait::async_trait;
use backon::{BackoffBuilder, ExponentialBuilder};
use common_telemetry::{debug, error, info, warn};
use common_time::timestamp::{TimeUnit, Timestamp};
use serde::{Deserialize, Serialize};
use store_api::mito_engine_options::{APPEND_MODE_KEY, TTL_KEY};
use tokio::sync::mpsc::{channel, Receiver, Sender};
use tokio::task::JoinHandle;
use tokio::time::sleep;
use tokio_util::sync::CancellationToken;
use crate::error::{MismatchedSchemaSnafu, Result};
/// The default table name for storing the events.
pub const DEFAULT_EVENTS_TABLE_NAME: &str = "events";
/// The column name for the event type.
pub const EVENTS_TABLE_TYPE_COLUMN_NAME: &str = "type";
/// The column name for the event payload.
pub const EVENTS_TABLE_PAYLOAD_COLUMN_NAME: &str = "payload";
/// The column name for the event timestamp.
pub const EVENTS_TABLE_TIMESTAMP_COLUMN_NAME: &str = "timestamp";
/// EventRecorderRef is the reference to the event recorder.
pub type EventRecorderRef = Arc<dyn EventRecorder>;
static EVENTS_TABLE_TTL: OnceLock<String> = OnceLock::new();
/// The time interval for flushing batched events to the event handler.
pub const DEFAULT_FLUSH_INTERVAL_SECONDS: Duration = Duration::from_secs(5);
// The default TTL for the events table.
const DEFAULT_EVENTS_TABLE_TTL: &str = "30d";
// The capacity of the tokio channel for transmitting events to background processor.
const DEFAULT_CHANNEL_SIZE: usize = 2048;
// The size of the buffer for batching events before flushing to event handler.
const DEFAULT_BUFFER_SIZE: usize = 100;
// The maximum number of retry attempts when event handler processing fails.
const DEFAULT_MAX_RETRY_TIMES: u64 = 3;
/// Event trait defines the interface for events that can be recorded and persisted to the system table.
/// By default, an event is persisted to the system table with the following schema:
///
/// - `type`: the type of the event.
/// - `payload`: the JSON bytes of the event.
/// - `timestamp`: the timestamp of the event.
///
/// An event can also append extra columns by overriding the `extra_schema` and `extra_row` methods.
pub trait Event: Send + Sync + Debug {
/// Returns the type of the event.
fn event_type(&self) -> &str;
/// Returns the timestamp of the event. Defaults to the current time.
fn timestamp(&self) -> Timestamp {
Timestamp::current_time(TimeUnit::Nanosecond)
}
/// Returns the JSON bytes of the event as the payload. It will use JSON type to store the payload.
fn json_payload(&self) -> Result<String>;
/// Returns the extra schema columns appended after the default schema. Defaults to empty.
fn extra_schema(&self) -> Vec<ColumnSchema> {
vec![]
}
/// Returns the extra row values appended after the default row. Defaults to empty.
fn extra_row(&self) -> Result<Row> {
Ok(Row { values: vec![] })
}
/// Returns the event as any type.
fn as_any(&self) -> &dyn Any;
}
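// A hedged example (not part of this diff): a hypothetical event type that relies on the
// default `timestamp`, `extra_schema` and `extra_row` implementations; the type name and
// payload are illustrative only.
#[derive(Debug)]
struct ExampleCompactionEvent {
    region_id: u64,
}

impl Event for ExampleCompactionEvent {
    fn event_type(&self) -> &str {
        "compaction_finished"
    }

    fn json_payload(&self) -> Result<String> {
        // Hand-roll the JSON string to keep the sketch dependency-free.
        Ok(format!("{{\"region_id\": {}}}", self.region_id))
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}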
/// Returns the hints for the insert operation.
pub fn insert_hints() -> Vec<(&'static str, &'static str)> {
vec![
(
TTL_KEY,
EVENTS_TABLE_TTL
.get()
.map(|s| s.as_str())
.unwrap_or(DEFAULT_EVENTS_TABLE_TTL),
),
(APPEND_MODE_KEY, "true"),
]
}
/// Builds the row inserts request for the events that will be persisted to the events table.
pub fn build_row_inserts_request(events: &[Box<dyn Event>]) -> Result<RowInsertRequests> {
// Aggregate the events by the event type.
let mut event_groups: HashMap<&str, Vec<&Box<dyn Event>>> = HashMap::new();
for event in events {
event_groups
.entry(event.event_type())
.or_default()
.push(event);
}
let mut row_insert_requests = RowInsertRequests {
inserts: Vec::with_capacity(event_groups.len()),
};
for (_, events) in event_groups {
validate_events(&events)?;
// We already validated the events, so it's safe to get the first event to build the schema for the RowInsertRequest.
let event = &events[0];
let mut schema = vec![
ColumnSchema {
column_name: EVENTS_TABLE_TYPE_COLUMN_NAME.to_string(),
datatype: ColumnDataType::String.into(),
semantic_type: SemanticType::Tag.into(),
..Default::default()
},
ColumnSchema {
column_name: EVENTS_TABLE_PAYLOAD_COLUMN_NAME.to_string(),
datatype: ColumnDataType::Binary as i32,
semantic_type: SemanticType::Field as i32,
datatype_extension: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
}),
..Default::default()
},
ColumnSchema {
column_name: EVENTS_TABLE_TIMESTAMP_COLUMN_NAME.to_string(),
datatype: ColumnDataType::TimestampNanosecond.into(),
semantic_type: SemanticType::Timestamp.into(),
..Default::default()
},
];
schema.extend(event.extra_schema());
let rows = events
.iter()
.map(|event| {
let mut row = Row {
values: vec![
ValueData::StringValue(event.event_type().to_string()).into(),
ValueData::BinaryValue(event.json_payload()?.as_bytes().to_vec()).into(),
ValueData::TimestampNanosecondValue(event.timestamp().value()).into(),
],
};
row.values.extend(event.extra_row()?.values);
Ok(row)
})
.collect::<Result<Vec<_>>>()?;
row_insert_requests.inserts.push(RowInsertRequest {
table_name: DEFAULT_EVENTS_TABLE_NAME.to_string(),
rows: Some(Rows { schema, rows }),
});
}
Ok(row_insert_requests)
}
// Ensure the events with the same event type have the same extra schema.
#[allow(clippy::borrowed_box)]
fn validate_events(events: &[&Box<dyn Event>]) -> Result<()> {
// It's safe to get the first event because the events are already grouped by the event type.
let extra_schema = events[0].extra_schema();
for event in events {
if event.extra_schema() != extra_schema {
MismatchedSchemaSnafu {
expected: extra_schema.clone(),
actual: event.extra_schema(),
}
.fail()?;
}
}
Ok(())
}
/// EventRecorder trait defines the interface for recording events.
pub trait EventRecorder: Send + Sync + 'static {
/// Records an event for persistence and processing by [EventHandler].
fn record(&self, event: Box<dyn Event>);
/// Cancels the event recorder.
fn close(&self);
}
/// EventHandler trait defines the interface for how to handle the event.
#[async_trait]
pub trait EventHandler: Send + Sync + 'static {
/// Processes and handles incoming events. The [DefaultEventHandlerImpl] implementation forwards events to frontend instances for persistence.
/// We use `&[Box<dyn Event>]` to avoid consuming the events, so the caller can buffer the events and retry if the handler fails.
async fn handle(&self, events: &[Box<dyn Event>]) -> Result<()>;
}
/// Configuration options for the event recorder.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EventRecorderOptions {
/// TTL for the events table that will be used to store the events.
pub ttl: String,
}
impl Default for EventRecorderOptions {
fn default() -> Self {
Self {
ttl: DEFAULT_EVENTS_TABLE_TTL.to_string(),
}
}
}
/// Implementation of [EventRecorder] that records events and processes them in the background via the [EventHandler].
pub struct EventRecorderImpl {
// The channel to send the events to the background processor.
tx: Sender<Box<dyn Event>>,
// The cancel token to cancel the background processor.
cancel_token: CancellationToken,
// The background processor to process the events.
handle: Option<JoinHandle<()>>,
}
impl EventRecorderImpl {
pub fn new(event_handler: Box<dyn EventHandler>, opts: EventRecorderOptions) -> Self {
info!("Creating event recorder with options: {:?}", opts);
let (tx, rx) = channel(DEFAULT_CHANNEL_SIZE);
let cancel_token = CancellationToken::new();
let mut recorder = Self {
tx,
handle: None,
cancel_token: cancel_token.clone(),
};
let processor = EventProcessor::new(
rx,
event_handler,
DEFAULT_FLUSH_INTERVAL_SECONDS,
DEFAULT_MAX_RETRY_TIMES,
)
.with_cancel_token(cancel_token);
// Spawn a background task to process the events.
let handle = tokio::spawn(async move {
processor.process(DEFAULT_BUFFER_SIZE).await;
});
recorder.handle = Some(handle);
// It only sets the ttl once, so it's safe to skip the error.
if EVENTS_TABLE_TTL.set(opts.ttl.clone()).is_err() {
info!(
"Events table ttl already set to {}, skip setting it",
opts.ttl
);
}
recorder
}
}
impl EventRecorder for EventRecorderImpl {
// Accepts an event and sends it to the background handler.
fn record(&self, event: Box<dyn Event>) {
if let Err(e) = self.tx.try_send(event) {
error!("Failed to send event to the background processor: {}", e);
}
}
// Closes the event recorder. It will stop the background processor and flush the buffer.
fn close(&self) {
self.cancel_token.cancel();
}
}
impl Drop for EventRecorderImpl {
fn drop(&mut self) {
if let Some(handle) = self.handle.take() {
handle.abort();
info!("Aborted the background processor in event recorder");
}
}
}
struct EventProcessor {
rx: Receiver<Box<dyn Event>>,
event_handler: Box<dyn EventHandler>,
max_retry_times: u64,
process_interval: Duration,
cancel_token: CancellationToken,
}
impl EventProcessor {
fn new(
rx: Receiver<Box<dyn Event>>,
event_handler: Box<dyn EventHandler>,
process_interval: Duration,
max_retry_times: u64,
) -> Self {
Self {
rx,
event_handler,
max_retry_times,
process_interval,
cancel_token: CancellationToken::new(),
}
}
fn with_cancel_token(mut self, cancel_token: CancellationToken) -> Self {
self.cancel_token = cancel_token;
self
}
async fn process(mut self, buffer_size: usize) {
info!("Start the background processor in event recorder to handle the received events.");
let mut buffer = Vec::with_capacity(buffer_size);
let mut interval = tokio::time::interval(self.process_interval);
loop {
tokio::select! {
maybe_event = self.rx.recv() => {
if let Some(maybe_event) = maybe_event {
debug!("Received event: {:?}", maybe_event);
if buffer.len() >= buffer_size {
debug!(
"Flushing events to the event handler because the buffer is full with {} events",
buffer.len()
);
self.flush_events_to_handler(&mut buffer).await;
}
// Push the event to the buffer; the buffer will be flushed when the interval is triggered or a close signal is received.
buffer.push(maybe_event);
} else {
// When a close signal is received, flush the buffer and exit the loop.
self.flush_events_to_handler(&mut buffer).await;
break;
}
}
// Cancel the processor through the cancel token.
_ = self.cancel_token.cancelled() => {
warn!("Received a cancel signal, flushing the buffer and exiting the loop");
self.flush_events_to_handler(&mut buffer).await;
break;
}
// When the interval is triggered, flush the buffer and send the events to the event handler.
_ = interval.tick() => {
self.flush_events_to_handler(&mut buffer).await;
}
}
}
}
// NOTE: While we implement a retry mechanism for failed event handling, there is no guarantee that all events will be processed successfully.
async fn flush_events_to_handler(&self, buffer: &mut Vec<Box<dyn Event>>) {
if !buffer.is_empty() {
debug!("Flushing {} events to the event handler", buffer.len());
let mut backoff = ExponentialBuilder::default()
.with_min_delay(Duration::from_millis(
DEFAULT_FLUSH_INTERVAL_SECONDS.as_millis() as u64 / self.max_retry_times.max(1),
))
.with_max_delay(Duration::from_millis(
DEFAULT_FLUSH_INTERVAL_SECONDS.as_millis() as u64,
))
.with_max_times(self.max_retry_times as usize)
.build();
loop {
match self.event_handler.handle(buffer).await {
Ok(()) => {
debug!("Successfully handled {} events", buffer.len());
break;
}
Err(e) => {
if let Some(d) = backoff.next() {
warn!(e; "Failed to handle events, retrying...");
sleep(d).await;
continue;
} else {
warn!(
e; "Failed to handle events after {} retries",
self.max_retry_times
);
break;
}
}
}
}
}
// Clear the buffer to prevent unbounded memory growth, regardless of whether event processing succeeded or failed.
buffer.clear();
}
}
#[cfg(test)]
mod tests {
use super::*;
#[derive(Debug)]
struct TestEvent {}
impl Event for TestEvent {
fn event_type(&self) -> &str {
"test_event"
}
fn json_payload(&self) -> Result<String> {
Ok("{\"procedure_id\": \"1234567890\"}".to_string())
}
fn as_any(&self) -> &dyn Any {
self
}
}
struct TestEventHandlerImpl {}
#[async_trait]
impl EventHandler for TestEventHandlerImpl {
async fn handle(&self, events: &[Box<dyn Event>]) -> Result<()> {
let event = events
.first()
.unwrap()
.as_any()
.downcast_ref::<TestEvent>()
.unwrap();
assert_eq!(
event.json_payload().unwrap(),
"{\"procedure_id\": \"1234567890\"}"
);
assert_eq!(event.event_type(), "test_event");
Ok(())
}
}
#[tokio::test]
async fn test_event_recorder() {
let mut event_recorder = EventRecorderImpl::new(
Box::new(TestEventHandlerImpl {}),
EventRecorderOptions::default(),
);
event_recorder.record(Box::new(TestEvent {}));
// Sleep for a while to let the event be sent to the event handler.
sleep(Duration::from_millis(500)).await;
// Close the event recorder to flush the buffer.
event_recorder.close();
// Sleep for a while to let the background task process the event.
sleep(Duration::from_millis(500)).await;
if let Some(handle) = event_recorder.handle.take() {
assert!(handle.await.is_ok());
}
}
struct TestEventHandlerImplShouldPanic {}
#[async_trait]
impl EventHandler for TestEventHandlerImplShouldPanic {
async fn handle(&self, events: &[Box<dyn Event>]) -> Result<()> {
let event = events
.first()
.unwrap()
.as_any()
.downcast_ref::<TestEvent>()
.unwrap();
// Set the incorrect payload and event type to trigger the panic.
assert_eq!(
event.json_payload().unwrap(),
"{\"procedure_id\": \"should_panic\"}"
);
assert_eq!(event.event_type(), "should_panic");
Ok(())
}
}
#[tokio::test]
async fn test_event_recorder_should_panic() {
let mut event_recorder = EventRecorderImpl::new(
Box::new(TestEventHandlerImplShouldPanic {}),
EventRecorderOptions::default(),
);
event_recorder.record(Box::new(TestEvent {}));
// Sleep for a while to let the event be sent to the event handler.
sleep(Duration::from_millis(500)).await;
// Close the event recorder to flush the buffer.
event_recorder.close();
// Sleep for a while to let the background task process the event.
sleep(Duration::from_millis(500)).await;
if let Some(handle) = event_recorder.handle.take() {
assert!(handle.await.unwrap_err().is_panic());
}
}
}

View File

@@ -16,6 +16,8 @@ geo = ["geohash", "h3o", "s2", "wkt", "geo-types", "dep:geo"]
ahash.workspace = true
api.workspace = true
arc-swap = "1.0"
arrow.workspace = true
arrow-schema.workspace = true
async-trait.workspace = true
bincode = "1.3"
catalog.workspace = true
@@ -34,6 +36,7 @@ datafusion.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
datafusion-functions-aggregate-common.workspace = true
datafusion-physical-expr.workspace = true
datatypes.workspace = true
derive_more = { version = "1", default-features = false, features = ["display"] }
geo = { version = "0.29", optional = true }
@@ -62,5 +65,7 @@ wkt = { version = "0.11", optional = true }
[dev-dependencies]
approx = "0.5"
futures.workspace = true
pretty_assertions = "1.4.0"
serde = { version = "1.0", features = ["derive"] }
tokio.workspace = true

View File

@@ -17,3 +17,5 @@ pub mod count_hash;
#[cfg(feature = "geo")]
pub mod geo;
pub mod vector;
pub mod aggr_wrapper;

View File

@@ -0,0 +1,538 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Wrappers for making aggregate functions out of the state/merge functions of an original aggregate function.
//!
//! i.e. for an aggregate function `foo`, we will have a state function `foo_state` and a merge function `foo_merge`.
//!
//! `foo_state`'s input args are the same as `foo`'s, and its output is a state object.
//! Note that `foo_state` might have multiple state columns, so its output is a struct array
//! in which each state column is a struct field.
//! `foo_merge`'s input arg is `foo_state`'s output, and its output is the same as `foo`'s output.
//!
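//! For example (an illustrative sketch, not taken from this code): splitting a plan that
//! computes `avg(x)` yields a lower aggregate computing `__avg_state(x)`, whose output is a
//! struct of partial-state columns (e.g. sum and count), and an upper aggregate computing
//! `__avg_merge(<that struct>)`, whose output matches the original `avg(x)`.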
use std::sync::Arc;
use arrow::array::StructArray;
use arrow_schema::Fields;
use datafusion::optimizer::analyzer::type_coercion::TypeCoercion;
use datafusion::optimizer::AnalyzerRule;
use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter;
use datafusion_common::{Column, ScalarValue};
use datafusion_expr::expr::AggregateFunction;
use datafusion_expr::function::StateFieldsArgs;
use datafusion_expr::{
Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, Expr, ExprSchemable, LogicalPlan,
Signature,
};
use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
use datatypes::arrow::datatypes::{DataType, Field};
/// Returns the name of the state function for the given aggregate function name.
/// The state function is used to compute the state of the aggregate function.
/// The state function's name is in the format `__<aggr_name>_state`.
pub fn aggr_state_func_name(aggr_name: &str) -> String {
format!("__{}_state", aggr_name)
}
/// Returns the name of the merge function for the given aggregate function name.
/// The merge function is used to merge the states of the state functions.
/// The merge function's name is in the format `__<aggr_name>_merge`.
pub fn aggr_merge_func_name(aggr_name: &str) -> String {
format!("__{}_merge", aggr_name)
}
/// A wrapper to make an aggregate function out of the state and merge functions of the original aggregate function.
/// It contains the original aggregate function, the state functions, and the merge function.
///
/// Notice a state function may have multiple output columns, so its return type is always a struct array, and the merge function is used to merge the states of the state functions.
#[derive(Debug, Clone)]
pub struct StateMergeHelper;
/// A struct to hold the two aggregate plans, one for the state function (lower) and one for the merge function (upper).
#[allow(unused)]
#[derive(Debug, Clone)]
pub struct StepAggrPlan {
/// Upper merge plan, which is the aggregate plan that merges the states of the state function.
pub upper_merge: Arc<LogicalPlan>,
/// Lower state plan, which is the aggregate plan that computes the state of the aggregate function.
pub lower_state: Arc<LogicalPlan>,
}
pub fn get_aggr_func(expr: &Expr) -> Option<&datafusion_expr::expr::AggregateFunction> {
let mut expr_ref = expr;
while let Expr::Alias(alias) = expr_ref {
expr_ref = &alias.expr;
}
if let Expr::AggregateFunction(aggr_func) = expr_ref {
Some(aggr_func)
} else {
None
}
}
impl StateMergeHelper {
/// Split an aggregate plan into two aggregate plans, one for the state function and one for the merge function.
pub fn split_aggr_node(aggr_plan: Aggregate) -> datafusion_common::Result<StepAggrPlan> {
let aggr = {
// Certain aggregate functions need type coercion to work correctly, so we need to analyze the plan first.
let aggr_plan = TypeCoercion::new().analyze(
LogicalPlan::Aggregate(aggr_plan).clone(),
&Default::default(),
)?;
if let LogicalPlan::Aggregate(aggr) = aggr_plan {
aggr
} else {
return Err(datafusion_common::DataFusionError::Internal(format!(
"Failed to coerce expressions in aggregate plan, expected Aggregate, got: {:?}",
aggr_plan
)));
}
};
let mut lower_aggr_exprs = vec![];
let mut upper_aggr_exprs = vec![];
for aggr_expr in aggr.aggr_expr.iter() {
let Some(aggr_func) = get_aggr_func(aggr_expr) else {
return Err(datafusion_common::DataFusionError::NotImplemented(format!(
"Unsupported aggregate expression for step aggr optimize: {:?}",
aggr_expr
)));
};
let original_input_types = aggr_func
.args
.iter()
.map(|e| e.get_type(&aggr.input.schema()))
.collect::<Result<Vec<_>, _>>()?;
// first create the state function from the original aggregate function.
let state_func = StateWrapper::new((*aggr_func.func).clone())?;
let expr = AggregateFunction {
func: Arc::new(state_func.into()),
args: aggr_func.args.clone(),
distinct: aggr_func.distinct,
filter: aggr_func.filter.clone(),
order_by: aggr_func.order_by.clone(),
null_treatment: aggr_func.null_treatment,
};
let expr = Expr::AggregateFunction(expr);
let lower_state_output_col_name = expr.schema_name().to_string();
lower_aggr_exprs.push(expr);
let (original_phy_expr, _filter, _ordering) = create_aggregate_expr_and_maybe_filter(
aggr_expr,
aggr.input.schema(),
aggr.input.schema().as_arrow(),
&Default::default(),
)?;
let merge_func = MergeWrapper::new(
(*aggr_func.func).clone(),
original_phy_expr,
original_input_types,
)?;
let arg = Expr::Column(Column::new_unqualified(lower_state_output_col_name));
let expr = AggregateFunction {
func: Arc::new(merge_func.into()),
args: vec![arg],
distinct: aggr_func.distinct,
filter: aggr_func.filter.clone(),
order_by: aggr_func.order_by.clone(),
null_treatment: aggr_func.null_treatment,
};
// alias to the original aggregate expr's schema name, so parent plan can refer to it
// correctly.
let expr = Expr::AggregateFunction(expr).alias(aggr_expr.schema_name().to_string());
upper_aggr_exprs.push(expr);
}
let mut lower = aggr.clone();
lower.aggr_expr = lower_aggr_exprs;
let lower_plan = LogicalPlan::Aggregate(lower);
// update aggregate's output schema
let lower_plan = Arc::new(lower_plan.recompute_schema()?);
let mut upper = aggr.clone();
let aggr_plan = LogicalPlan::Aggregate(aggr);
upper.aggr_expr = upper_aggr_exprs;
upper.input = lower_plan.clone();
// the upper plan's output schema should be the same as the original aggregate plan's output schema
let upper_check = upper.clone();
let upper_plan = Arc::new(LogicalPlan::Aggregate(upper_check).recompute_schema()?);
if *upper_plan.schema() != *aggr_plan.schema() {
return Err(datafusion_common::DataFusionError::Internal(format!(
"Upper aggregate plan's schema is not the same as the original aggregate plan's schema: \n[transformed]:{}\n[ original]{}",
upper_plan.schema(), aggr_plan.schema()
)));
}
Ok(StepAggrPlan {
lower_state: lower_plan,
upper_merge: upper_plan,
})
}
}
/// Wrapper to make an aggregate function out of a state function.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StateWrapper {
inner: AggregateUDF,
name: String,
}
impl StateWrapper {
/// Creates a [`StateWrapper`] named `__<aggr_name>_state` from the original aggregate function.
pub fn new(inner: AggregateUDF) -> datafusion_common::Result<Self> {
let name = aggr_state_func_name(inner.name());
Ok(Self { inner, name })
}
pub fn inner(&self) -> &AggregateUDF {
&self.inner
}
/// Deduce the return type of the original aggregate function
/// based on the accumulator arguments.
///
pub fn deduce_aggr_return_type(
&self,
acc_args: &datafusion_expr::function::AccumulatorArgs,
) -> datafusion_common::Result<DataType> {
let input_exprs = acc_args.exprs;
let input_schema = acc_args.schema;
let input_types = input_exprs
.iter()
.map(|e| e.data_type(input_schema))
.collect::<Result<Vec<_>, _>>()?;
let return_type = self.inner.return_type(&input_types)?;
Ok(return_type)
}
}
impl AggregateUDFImpl for StateWrapper {
fn accumulator<'a, 'b>(
&'a self,
acc_args: datafusion_expr::function::AccumulatorArgs<'b>,
) -> datafusion_common::Result<Box<dyn Accumulator>> {
// fix and recover proper acc args for the original aggregate function.
let state_type = acc_args.return_type.clone();
let inner = {
let old_return_type = self.deduce_aggr_return_type(&acc_args)?;
let acc_args = datafusion_expr::function::AccumulatorArgs {
return_type: &old_return_type,
schema: acc_args.schema,
ignore_nulls: acc_args.ignore_nulls,
ordering_req: acc_args.ordering_req,
is_reversed: acc_args.is_reversed,
name: acc_args.name,
is_distinct: acc_args.is_distinct,
exprs: acc_args.exprs,
};
self.inner.accumulator(acc_args)?
};
Ok(Box::new(StateAccum::new(inner, state_type)?))
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn name(&self) -> &str {
self.name.as_str()
}
fn is_nullable(&self) -> bool {
self.inner.is_nullable()
}
/// Returns the state fields as the output struct type.
///
fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
let old_return_type = self.inner.return_type(arg_types)?;
let state_fields_args = StateFieldsArgs {
name: self.inner().name(),
input_types: arg_types,
return_type: &old_return_type,
// TODO(discord9): how to get this?, probably ok?
ordering_fields: &[],
is_distinct: false,
};
let state_fields = self.inner.state_fields(state_fields_args)?;
let struct_field = DataType::Struct(state_fields.into());
Ok(struct_field)
}
/// The state function's output fields are the same as the original aggregate function's state fields.
fn state_fields(
&self,
args: datafusion_expr::function::StateFieldsArgs,
) -> datafusion_common::Result<Vec<Field>> {
let old_return_type = self.inner.return_type(args.input_types)?;
let state_fields_args = StateFieldsArgs {
name: args.name,
input_types: args.input_types,
return_type: &old_return_type,
ordering_fields: args.ordering_fields,
is_distinct: args.is_distinct,
};
self.inner.state_fields(state_fields_args)
}
/// The state function's signature is the same as the original aggregate function's signature.
fn signature(&self) -> &Signature {
self.inner.signature()
}
/// Coercing types also does nothing extra, as the optimizer should already be able to make struct types.
fn coerce_types(&self, arg_types: &[DataType]) -> datafusion_common::Result<Vec<DataType>> {
self.inner.coerce_types(arg_types)
}
}
/// The wrapper's input is the same as the original aggregate function's input,
/// and the output is the state function's output.
#[derive(Debug)]
pub struct StateAccum {
inner: Box<dyn Accumulator>,
state_fields: Fields,
}
impl StateAccum {
pub fn new(
inner: Box<dyn Accumulator>,
state_type: DataType,
) -> datafusion_common::Result<Self> {
let DataType::Struct(fields) = state_type else {
return Err(datafusion_common::DataFusionError::Internal(format!(
"Expected a struct type for state, got: {:?}",
state_type
)));
};
Ok(Self {
inner,
state_fields: fields,
})
}
}
impl Accumulator for StateAccum {
fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
let state = self.inner.state()?;
let array = state
.iter()
.map(|s| s.to_array())
.collect::<Result<Vec<_>, _>>()?;
let struct_array = StructArray::try_new(self.state_fields.clone(), array, None)?;
Ok(ScalarValue::Struct(Arc::new(struct_array)))
}
fn merge_batch(
&mut self,
states: &[datatypes::arrow::array::ArrayRef],
) -> datafusion_common::Result<()> {
self.inner.merge_batch(states)
}
fn update_batch(
&mut self,
values: &[datatypes::arrow::array::ArrayRef],
) -> datafusion_common::Result<()> {
self.inner.update_batch(values)
}
fn size(&self) -> usize {
self.inner.size()
}
fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
self.inner.state()
}
}
/// TODO(discord9): mark this function as not ser/de-able.
///
/// This wrapper shouldn't be registered as a UDAF, as it contains extra data that is not
/// serializable and changes for different logical plans.
#[derive(Debug, Clone)]
pub struct MergeWrapper {
inner: AggregateUDF,
name: String,
merge_signature: Signature,
/// The original physical expression of the aggregate function; we can't store the original aggregate function directly, as `PhysicalExpr` doesn't implement `Any`.
original_phy_expr: Arc<AggregateFunctionExpr>,
original_input_types: Vec<DataType>,
}
impl MergeWrapper {
pub fn new(
inner: AggregateUDF,
original_phy_expr: Arc<AggregateFunctionExpr>,
original_input_types: Vec<DataType>,
) -> datafusion_common::Result<Self> {
let name = aggr_merge_func_name(inner.name());
// The input type is actually a struct type containing the state fields of the original aggregate function.
let merge_signature = Signature::user_defined(datafusion_expr::Volatility::Immutable);
Ok(Self {
inner,
name,
merge_signature,
original_phy_expr,
original_input_types,
})
}
pub fn inner(&self) -> &AggregateUDF {
&self.inner
}
}
impl AggregateUDFImpl for MergeWrapper {
fn accumulator<'a, 'b>(
&'a self,
acc_args: datafusion_expr::function::AccumulatorArgs<'b>,
) -> datafusion_common::Result<Box<dyn Accumulator>> {
if acc_args.schema.fields().len() != 1
|| !matches!(acc_args.schema.field(0).data_type(), DataType::Struct(_))
{
return Err(datafusion_common::DataFusionError::Internal(format!(
"Expected one struct type as input, got: {:?}",
acc_args.schema
)));
}
let input_type = acc_args.schema.field(0).data_type();
let DataType::Struct(fields) = input_type else {
return Err(datafusion_common::DataFusionError::Internal(format!(
"Expected a struct type for input, got: {:?}",
input_type
)));
};
let inner_accum = self.original_phy_expr.create_accumulator()?;
Ok(Box::new(MergeAccum::new(inner_accum, fields)))
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn name(&self) -> &str {
self.name.as_str()
}
fn is_nullable(&self) -> bool {
self.inner.is_nullable()
}
/// Notice that the `arg_types` here are actually the `state_fields`' data types,
/// so return a fixed return type instead of deriving it from `arg_types`.
fn return_type(&self, _arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
// The return type is the same as the original aggregate function's return type.
let ret_type = self.inner.return_type(&self.original_input_types)?;
Ok(ret_type)
}
fn signature(&self) -> &Signature {
&self.merge_signature
}
/// Coercing types also does nothing extra, as the optimizer should already be able to make struct types.
fn coerce_types(&self, arg_types: &[DataType]) -> datafusion_common::Result<Vec<DataType>> {
// Just check that there is exactly one arg type and that it is a struct type.
if arg_types.len() != 1 || !matches!(arg_types.first(), Some(DataType::Struct(_))) {
return Err(datafusion_common::DataFusionError::Internal(format!(
"Expected one struct type as input, got: {:?}",
arg_types
)));
}
Ok(arg_types.to_vec())
}
/// Just return the original aggregate function's state fields.
fn state_fields(
&self,
_args: datafusion_expr::function::StateFieldsArgs,
) -> datafusion_common::Result<Vec<Field>> {
self.original_phy_expr.state_fields()
}
}
/// The merge accumulator, which modifies `update_batch`'s behavior to accept one struct array that
/// includes the state fields of the original aggregate function and merges those states into the
/// original accumulator; the output is the same as the original aggregate function's.
#[derive(Debug)]
pub struct MergeAccum {
inner: Box<dyn Accumulator>,
state_fields: Fields,
}
impl MergeAccum {
pub fn new(inner: Box<dyn Accumulator>, state_fields: &Fields) -> Self {
Self {
inner,
state_fields: state_fields.clone(),
}
}
}
impl Accumulator for MergeAccum {
fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
self.inner.evaluate()
}
fn merge_batch(&mut self, states: &[arrow::array::ArrayRef]) -> datafusion_common::Result<()> {
self.inner.merge_batch(states)
}
fn update_batch(&mut self, values: &[arrow::array::ArrayRef]) -> datafusion_common::Result<()> {
let value = values.first().ok_or_else(|| {
datafusion_common::DataFusionError::Internal("No values provided for merge".to_string())
})?;
// The input values are states from other accumulators, so we merge them.
let struct_arr = value
.as_any()
.downcast_ref::<StructArray>()
.ok_or_else(|| {
datafusion_common::DataFusionError::Internal(format!(
"Expected StructArray, got: {:?}",
value.data_type()
))
})?;
let fields = struct_arr.fields();
if fields != &self.state_fields {
return Err(datafusion_common::DataFusionError::Internal(format!(
"Expected state fields: {:?}, got: {:?}",
self.state_fields, fields
)));
}
// Now the fields should be the same, so we can merge the batch by passing the columns
// directly, as the order should be the same.
let state_columns = struct_arr.columns();
self.inner.merge_batch(state_columns)
}
fn size(&self) -> usize {
self.inner.size()
}
fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
self.inner.state()
}
}
#[cfg(test)]
mod tests;

View File

@@ -0,0 +1,804 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use arrow::array::{ArrayRef, Float64Array, Int64Array, UInt64Array};
use arrow::record_batch::RecordBatch;
use arrow_schema::SchemaRef;
use datafusion::catalog::{Session, TableProvider};
use datafusion::datasource::DefaultTableSource;
use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext};
use datafusion::functions_aggregate::average::avg_udaf;
use datafusion::functions_aggregate::sum::sum_udaf;
use datafusion::optimizer::analyzer::type_coercion::TypeCoercion;
use datafusion::optimizer::AnalyzerRule;
use datafusion::physical_plan::aggregates::AggregateExec;
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
use datafusion::prelude::SessionContext;
use datafusion_common::{Column, TableReference};
use datafusion_expr::expr::AggregateFunction;
use datafusion_expr::sqlparser::ast::NullTreatment;
use datafusion_expr::{Aggregate, Expr, LogicalPlan, SortExpr, TableScan};
use datafusion_physical_expr::aggregate::AggregateExprBuilder;
use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
use datatypes::arrow_array::StringArray;
use futures::{Stream, StreamExt as _};
use pretty_assertions::assert_eq;
use super::*;
use crate::aggrs::approximate::hll::HllState;
use crate::aggrs::approximate::uddsketch::UddSketchState;
use crate::aggrs::count_hash::CountHash;
use crate::function::Function as _;
use crate::scalars::hll_count::HllCalcFunction;
use crate::scalars::uddsketch_calc::UddSketchCalcFunction;
#[derive(Debug)]
pub struct MockInputExec {
input: Vec<RecordBatch>,
schema: SchemaRef,
properties: PlanProperties,
}
impl MockInputExec {
pub fn new(input: Vec<RecordBatch>, schema: SchemaRef) -> Self {
Self {
properties: PlanProperties::new(
EquivalenceProperties::new(schema.clone()),
Partitioning::UnknownPartitioning(1),
EmissionType::Incremental,
Boundedness::Bounded,
),
input,
schema,
}
}
}
impl DisplayAs for MockInputExec {
fn fmt_as(&self, _t: DisplayFormatType, _f: &mut std::fmt::Formatter) -> std::fmt::Result {
unimplemented!()
}
}
impl ExecutionPlan for MockInputExec {
fn name(&self) -> &str {
"MockInputExec"
}
fn as_any(&self) -> &dyn Any {
self
}
fn properties(&self) -> &PlanProperties {
&self.properties
}
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
vec![]
}
fn with_new_children(
self: Arc<Self>,
_children: Vec<Arc<dyn ExecutionPlan>>,
) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
Ok(self)
}
fn execute(
&self,
_partition: usize,
_context: Arc<TaskContext>,
) -> datafusion_common::Result<SendableRecordBatchStream> {
let stream = MockStream {
stream: self.input.clone(),
schema: self.schema.clone(),
idx: 0,
};
Ok(Box::pin(stream))
}
}
struct MockStream {
stream: Vec<RecordBatch>,
schema: SchemaRef,
idx: usize,
}
impl Stream for MockStream {
type Item = datafusion_common::Result<RecordBatch>;
fn poll_next(
mut self: Pin<&mut Self>,
_cx: &mut Context<'_>,
) -> Poll<Option<datafusion_common::Result<RecordBatch>>> {
if self.idx < self.stream.len() {
let ret = self.stream[self.idx].clone();
self.idx += 1;
Poll::Ready(Some(Ok(ret)))
} else {
Poll::Ready(None)
}
}
}
impl RecordBatchStream for MockStream {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
}
#[derive(Debug)]
struct DummyTableProvider {
schema: Arc<arrow_schema::Schema>,
record_batch: Mutex<Option<RecordBatch>>,
}
impl DummyTableProvider {
#[allow(unused)]
pub fn new(schema: Arc<arrow_schema::Schema>, record_batch: Option<RecordBatch>) -> Self {
Self {
schema,
record_batch: Mutex::new(record_batch),
}
}
}
impl Default for DummyTableProvider {
fn default() -> Self {
Self {
schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
"number",
DataType::Int64,
true,
)])),
record_batch: Mutex::new(None),
}
}
}
#[async_trait::async_trait]
impl TableProvider for DummyTableProvider {
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn schema(&self) -> Arc<arrow_schema::Schema> {
self.schema.clone()
}
fn table_type(&self) -> datafusion_expr::TableType {
datafusion_expr::TableType::Base
}
async fn scan(
&self,
_state: &dyn Session,
_projection: Option<&Vec<usize>>,
_filters: &[Expr],
_limit: Option<usize>,
) -> datafusion::error::Result<Arc<dyn ExecutionPlan>> {
let input: Vec<RecordBatch> = self
.record_batch
.lock()
.unwrap()
.clone()
.map(|r| vec![r])
.unwrap_or_default();
Ok(Arc::new(MockInputExec::new(input, self.schema.clone())))
}
}
fn dummy_table_scan() -> LogicalPlan {
let table_provider = Arc::new(DummyTableProvider::default());
let table_source = DefaultTableSource::new(table_provider);
LogicalPlan::TableScan(
TableScan::try_new(
TableReference::bare("Number"),
Arc::new(table_source),
None,
vec![],
None,
)
.unwrap(),
)
}
#[tokio::test]
async fn test_sum_udaf() {
let ctx = SessionContext::new();
let sum = datafusion::functions_aggregate::sum::sum_udaf();
let sum = (*sum).clone();
let original_aggr = Aggregate::try_new(
Arc::new(dummy_table_scan()),
vec![],
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
Arc::new(sum.clone()),
vec![Expr::Column(Column::new_unqualified("number"))],
false,
None,
None,
None,
))],
)
.unwrap();
let res = StateMergeHelper::split_aggr_node(original_aggr).unwrap();
let expected_lower_plan = LogicalPlan::Aggregate(
Aggregate::try_new(
Arc::new(dummy_table_scan()),
vec![],
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
Arc::new(StateWrapper::new(sum.clone()).unwrap().into()),
vec![Expr::Column(Column::new_unqualified("number"))],
false,
None,
None,
None,
))],
)
.unwrap(),
)
.recompute_schema()
.unwrap();
assert_eq!(res.lower_state.as_ref(), &expected_lower_plan);
let expected_merge_plan = LogicalPlan::Aggregate(
Aggregate::try_new(
Arc::new(expected_lower_plan),
vec![],
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
Arc::new(
MergeWrapper::new(
sum.clone(),
Arc::new(
AggregateExprBuilder::new(
Arc::new(sum.clone()),
vec![Arc::new(
datafusion::physical_expr::expressions::Column::new(
"number", 0,
),
)],
)
.schema(Arc::new(dummy_table_scan().schema().as_arrow().clone()))
.alias("sum(number)")
.build()
.unwrap(),
),
vec![DataType::Int64],
)
.unwrap()
.into(),
),
vec![Expr::Column(Column::new_unqualified("__sum_state(number)"))],
false,
None,
None,
None,
))
.alias("sum(number)")],
)
.unwrap(),
);
assert_eq!(res.upper_merge.as_ref(), &expected_merge_plan);
let phy_aggr_state_plan = DefaultPhysicalPlanner::default()
.create_physical_plan(&res.lower_state, &ctx.state())
.await
.unwrap();
let aggr_exec = phy_aggr_state_plan
.as_any()
.downcast_ref::<AggregateExec>()
.unwrap();
let aggr_func_expr = &aggr_exec.aggr_expr()[0];
let mut state_accum = aggr_func_expr.create_accumulator().unwrap();
// evaluate the state function
let input = Int64Array::from(vec![Some(1), Some(2), None, Some(3)]);
let values = vec![Arc::new(input) as arrow::array::ArrayRef];
state_accum.update_batch(&values).unwrap();
let state = state_accum.state().unwrap();
assert_eq!(state.len(), 1);
assert_eq!(state[0], ScalarValue::Int64(Some(6)));
let eval_res = state_accum.evaluate().unwrap();
assert_eq!(
eval_res,
ScalarValue::Struct(Arc::new(
StructArray::try_new(
vec![Field::new("sum[sum]", DataType::Int64, true)].into(),
vec![Arc::new(Int64Array::from(vec![Some(6)]))],
None,
)
.unwrap(),
))
);
let phy_aggr_merge_plan = DefaultPhysicalPlanner::default()
.create_physical_plan(&res.upper_merge, &ctx.state())
.await
.unwrap();
let aggr_exec = phy_aggr_merge_plan
.as_any()
.downcast_ref::<AggregateExec>()
.unwrap();
let aggr_func_expr = &aggr_exec.aggr_expr()[0];
let mut merge_accum = aggr_func_expr.create_accumulator().unwrap();
let merge_input =
vec![Arc::new(Int64Array::from(vec![Some(6), Some(42), None])) as arrow::array::ArrayRef];
let merge_input_struct_arr = StructArray::try_new(
vec![Field::new("sum[sum]", DataType::Int64, true)].into(),
merge_input,
None,
)
.unwrap();
merge_accum
.update_batch(&[Arc::new(merge_input_struct_arr)])
.unwrap();
let merge_state = merge_accum.state().unwrap();
assert_eq!(merge_state.len(), 1);
assert_eq!(merge_state[0], ScalarValue::Int64(Some(48)));
let merge_eval_res = merge_accum.evaluate().unwrap();
assert_eq!(merge_eval_res, ScalarValue::Int64(Some(48)));
}
#[tokio::test]
async fn test_avg_udaf() {
let ctx = SessionContext::new();
let avg = datafusion::functions_aggregate::average::avg_udaf();
let avg = (*avg).clone();
let original_aggr = Aggregate::try_new(
Arc::new(dummy_table_scan()),
vec![],
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
Arc::new(avg.clone()),
vec![Expr::Column(Column::new_unqualified("number"))],
false,
None,
None,
None,
))],
)
.unwrap();
let res = StateMergeHelper::split_aggr_node(original_aggr).unwrap();
let state_func: Arc<AggregateUDF> = Arc::new(StateWrapper::new(avg.clone()).unwrap().into());
let expected_aggr_state_plan = LogicalPlan::Aggregate(
Aggregate::try_new(
Arc::new(dummy_table_scan()),
vec![],
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
state_func,
vec![Expr::Column(Column::new_unqualified("number"))],
false,
None,
None,
None,
))],
)
.unwrap(),
);
// type-coerce the plan so the avg aggregate function works correctly
let coerced_aggr_state_plan = TypeCoercion::new()
.analyze(expected_aggr_state_plan.clone(), &Default::default())
.unwrap();
assert_eq!(res.lower_state.as_ref(), &coerced_aggr_state_plan);
assert_eq!(
res.lower_state.schema().as_arrow(),
&arrow_schema::Schema::new(vec![Field::new(
"__avg_state(number)",
DataType::Struct(
vec![
Field::new("avg[count]", DataType::UInt64, true),
Field::new("avg[sum]", DataType::Float64, true)
]
.into()
),
true,
)])
);
let expected_merge_fn = MergeWrapper::new(
avg.clone(),
Arc::new(
AggregateExprBuilder::new(
Arc::new(avg.clone()),
vec![Arc::new(
datafusion::physical_expr::expressions::Column::new("number", 0),
)],
)
.schema(Arc::new(dummy_table_scan().schema().as_arrow().clone()))
.alias("avg(number)")
.build()
.unwrap(),
),
// coerced to float64
vec![DataType::Float64],
)
.unwrap();
let expected_merge_plan = LogicalPlan::Aggregate(
Aggregate::try_new(
Arc::new(coerced_aggr_state_plan.clone()),
vec![],
vec![Expr::AggregateFunction(AggregateFunction::new_udf(
Arc::new(expected_merge_fn.into()),
vec![Expr::Column(Column::new_unqualified("__avg_state(number)"))],
false,
None,
None,
None,
))
.alias("avg(number)")],
)
.unwrap(),
);
assert_eq!(res.upper_merge.as_ref(), &expected_merge_plan);
let phy_aggr_state_plan = DefaultPhysicalPlanner::default()
.create_physical_plan(&coerced_aggr_state_plan, &ctx.state())
.await
.unwrap();
let aggr_exec = phy_aggr_state_plan
.as_any()
.downcast_ref::<AggregateExec>()
.unwrap();
let aggr_func_expr = &aggr_exec.aggr_expr()[0];
let mut state_accum = aggr_func_expr.create_accumulator().unwrap();
// evaluate the state function
let input = Float64Array::from(vec![Some(1.), Some(2.), None, Some(3.)]);
let values = vec![Arc::new(input) as arrow::array::ArrayRef];
state_accum.update_batch(&values).unwrap();
let state = state_accum.state().unwrap();
assert_eq!(state.len(), 2);
assert_eq!(state[0], ScalarValue::UInt64(Some(3)));
assert_eq!(state[1], ScalarValue::Float64(Some(6.)));
let eval_res = state_accum.evaluate().unwrap();
let expected = Arc::new(
StructArray::try_new(
vec![
Field::new("avg[count]", DataType::UInt64, true),
Field::new("avg[sum]", DataType::Float64, true),
]
.into(),
vec![
Arc::new(UInt64Array::from(vec![Some(3)])),
Arc::new(Float64Array::from(vec![Some(6.)])),
],
None,
)
.unwrap(),
);
assert_eq!(eval_res, ScalarValue::Struct(expected));
let phy_aggr_merge_plan = DefaultPhysicalPlanner::default()
.create_physical_plan(&res.upper_merge, &ctx.state())
.await
.unwrap();
let aggr_exec = phy_aggr_merge_plan
.as_any()
.downcast_ref::<AggregateExec>()
.unwrap();
let aggr_func_expr = &aggr_exec.aggr_expr()[0];
let mut merge_accum = aggr_func_expr.create_accumulator().unwrap();
let merge_input = vec![
Arc::new(UInt64Array::from(vec![Some(3), Some(42), None])) as arrow::array::ArrayRef,
Arc::new(Float64Array::from(vec![Some(48.), Some(84.), None])),
];
let merge_input_struct_arr = StructArray::try_new(
vec![
Field::new("avg[count]", DataType::UInt64, true),
Field::new("avg[sum]", DataType::Float64, true),
]
.into(),
merge_input,
None,
)
.unwrap();
merge_accum
.update_batch(&[Arc::new(merge_input_struct_arr)])
.unwrap();
let merge_state = merge_accum.state().unwrap();
assert_eq!(merge_state.len(), 2);
assert_eq!(merge_state[0], ScalarValue::UInt64(Some(45)));
assert_eq!(merge_state[1], ScalarValue::Float64(Some(132.)));
let merge_eval_res = merge_accum.evaluate().unwrap();
// the merge function returns the average, which is 132 / 45
assert_eq!(merge_eval_res, ScalarValue::Float64(Some(132. / 45_f64)));
}
/// Tests whether the UDAF state fields are correctly implemented, especially for our own custom
/// UDAFs' state fields, by comparing eval results before and after splitting into state/merge
/// functions.
#[tokio::test]
async fn test_udaf_correct_eval_result() {
struct TestCase {
func: Arc<AggregateUDF>,
args: Vec<Expr>,
input_schema: SchemaRef,
input: Vec<ArrayRef>,
expected_output: Option<ScalarValue>,
expected_fn: Option<ExpectedFn>,
distinct: bool,
filter: Option<Box<Expr>>,
order_by: Option<Vec<SortExpr>>,
null_treatment: Option<NullTreatment>,
}
type ExpectedFn = fn(ArrayRef) -> bool;
let test_cases = vec![
TestCase {
func: sum_udaf(),
input_schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
"number",
DataType::Int64,
true,
)])),
args: vec![Expr::Column(Column::new_unqualified("number"))],
input: vec![Arc::new(Int64Array::from(vec![
Some(1),
Some(2),
None,
Some(3),
]))],
expected_output: Some(ScalarValue::Int64(Some(6))),
expected_fn: None,
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
},
TestCase {
func: avg_udaf(),
input_schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
"number",
DataType::Int64,
true,
)])),
args: vec![Expr::Column(Column::new_unqualified("number"))],
input: vec![Arc::new(Int64Array::from(vec![
Some(1),
Some(2),
None,
Some(3),
]))],
expected_output: Some(ScalarValue::Float64(Some(2.0))),
expected_fn: None,
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
},
TestCase {
func: Arc::new(CountHash::udf_impl()),
input_schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
"number",
DataType::Int64,
true,
)])),
args: vec![Expr::Column(Column::new_unqualified("number"))],
input: vec![Arc::new(Int64Array::from(vec![
Some(1),
Some(2),
None,
Some(3),
Some(3),
Some(3),
]))],
expected_output: Some(ScalarValue::Int64(Some(4))),
expected_fn: None,
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
},
TestCase {
func: Arc::new(UddSketchState::state_udf_impl()),
input_schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
"number",
DataType::Float64,
true,
)])),
args: vec![
Expr::Literal(ScalarValue::Int64(Some(128))),
Expr::Literal(ScalarValue::Float64(Some(0.05))),
Expr::Column(Column::new_unqualified("number")),
],
input: vec![Arc::new(Float64Array::from(vec![
Some(1.),
Some(2.),
None,
Some(3.),
Some(3.),
Some(3.),
]))],
expected_output: None,
expected_fn: Some(|arr| {
let percent = ScalarValue::Float64(Some(0.5)).to_array().unwrap();
let percent = datatypes::vectors::Helper::try_into_vector(percent).unwrap();
let state = datatypes::vectors::Helper::try_into_vector(arr).unwrap();
let udd_calc = UddSketchCalcFunction;
let res = udd_calc
.eval(&Default::default(), &[percent, state])
.unwrap();
let binding = res.to_arrow_array();
let res_arr = binding.as_any().downcast_ref::<Float64Array>().unwrap();
assert!(res_arr.len() == 1);
assert!((res_arr.value(0) - 2.856578984907706f64).abs() <= f64::EPSILON);
true
}),
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
},
TestCase {
func: Arc::new(HllState::state_udf_impl()),
input_schema: Arc::new(arrow_schema::Schema::new(vec![Field::new(
"word",
DataType::Utf8,
true,
)])),
args: vec![Expr::Column(Column::new_unqualified("word"))],
input: vec![Arc::new(StringArray::from(vec![
Some("foo"),
Some("bar"),
None,
Some("baz"),
Some("baz"),
]))],
expected_output: None,
expected_fn: Some(|arr| {
let state = datatypes::vectors::Helper::try_into_vector(arr).unwrap();
let hll_calc = HllCalcFunction;
let res = hll_calc.eval(&Default::default(), &[state]).unwrap();
let binding = res.to_arrow_array();
let res_arr = binding.as_any().downcast_ref::<UInt64Array>().unwrap();
assert!(res_arr.len() == 1);
assert_eq!(res_arr.value(0), 3);
true
}),
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
},
// TODO(discord9): udd_merge/hll_merge/geo_path/quantile_aggr tests
];
let test_table_ref = TableReference::bare("TestTable");
for case in test_cases {
let ctx = SessionContext::new();
let table_provider = DummyTableProvider::new(
case.input_schema.clone(),
Some(RecordBatch::try_new(case.input_schema.clone(), case.input.clone()).unwrap()),
);
let table_source = DefaultTableSource::new(Arc::new(table_provider));
let logical_plan = LogicalPlan::TableScan(
TableScan::try_new(
test_table_ref.clone(),
Arc::new(table_source),
None,
vec![],
None,
)
.unwrap(),
);
let args = case.args;
let aggr_expr = Expr::AggregateFunction(AggregateFunction::new_udf(
case.func.clone(),
args,
case.distinct,
case.filter,
case.order_by,
case.null_treatment,
));
let aggr_plan = LogicalPlan::Aggregate(
Aggregate::try_new(Arc::new(logical_plan), vec![], vec![aggr_expr]).unwrap(),
);
// make sure the aggr_plan is type coerced
let aggr_plan = TypeCoercion::new()
.analyze(aggr_plan, &Default::default())
.unwrap();
// first eval the original aggregate function
let phy_full_aggr_plan = DefaultPhysicalPlanner::default()
.create_physical_plan(&aggr_plan, &ctx.state())
.await
.unwrap();
{
let unsplit_result = execute_phy_plan(&phy_full_aggr_plan).await.unwrap();
assert_eq!(unsplit_result.len(), 1);
let unsplit_batch = &unsplit_result[0];
assert_eq!(unsplit_batch.num_columns(), 1);
assert_eq!(unsplit_batch.num_rows(), 1);
let unsplit_col = unsplit_batch.column(0);
if let Some(expected_output) = &case.expected_output {
assert_eq!(unsplit_col.data_type(), &expected_output.data_type());
assert_eq!(unsplit_col.len(), 1);
assert_eq!(unsplit_col, &expected_output.to_array().unwrap());
}
if let Some(expected_fn) = &case.expected_fn {
assert!(expected_fn(unsplit_col.clone()));
}
}
let LogicalPlan::Aggregate(aggr_plan) = aggr_plan else {
panic!("Expected Aggregate plan");
};
let split_plan = StateMergeHelper::split_aggr_node(aggr_plan).unwrap();
let phy_upper_plan = DefaultPhysicalPlanner::default()
.create_physical_plan(&split_plan.upper_merge, &ctx.state())
.await
.unwrap();
// since the upper plan uses the lower plan as input, executing the upper plan also executes
// the lower plan, which should give the same result as the original aggregate function
{
let split_res = execute_phy_plan(&phy_upper_plan).await.unwrap();
assert_eq!(split_res.len(), 1);
let split_batch = &split_res[0];
assert_eq!(split_batch.num_columns(), 1);
assert_eq!(split_batch.num_rows(), 1);
let split_col = split_batch.column(0);
if let Some(expected_output) = &case.expected_output {
assert_eq!(split_col.data_type(), &expected_output.data_type());
assert_eq!(split_col.len(), 1);
assert_eq!(split_col, &expected_output.to_array().unwrap());
}
if let Some(expected_fn) = &case.expected_fn {
assert!(expected_fn(split_col.clone()));
}
}
}
}
async fn execute_phy_plan(
phy_plan: &Arc<dyn ExecutionPlan>,
) -> datafusion_common::Result<Vec<RecordBatch>> {
let task_ctx = Arc::new(TaskContext::default());
let mut stream = phy_plan.execute(0, task_ctx)?;
let mut batches = Vec::new();
while let Some(batch) = stream.next().await {
batches.push(batch?);
}
Ok(batches)
}

View File

@@ -81,7 +81,8 @@ impl Function for ClampFunction {
}
);
ensure!(
columns[1].len() == 1 && columns[2].len() == 1,
(columns[1].len() == 1 || columns[1].is_const())
&& (columns[2].len() == 1 || columns[2].is_const()),
InvalidFuncArgsSnafu {
err_msg: format!(
"The second and third args should be scalar, have: {:?}, {:?}",
@@ -204,7 +205,7 @@ impl Function for ClampMinFunction {
}
);
ensure!(
columns[1].len() == 1,
columns[1].len() == 1 || columns[1].is_const(),
InvalidFuncArgsSnafu {
err_msg: format!(
"The second arg (min) should be scalar, have: {:?}",
@@ -292,7 +293,7 @@ impl Function for ClampMaxFunction {
}
);
ensure!(
columns[1].len() == 1,
columns[1].len() == 1 || columns[1].is_const(),
InvalidFuncArgsSnafu {
err_msg: format!(
"The second arg (max) should be scalar, have: {:?}",

View File

@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt;
use std::sync::Arc;
use std::{env, fmt};
use common_query::error::Result;
use common_query::prelude::{Signature, Volatility};
@@ -47,7 +47,7 @@ impl Function for PGVersionFunction {
fn eval(&self, _func_ctx: &FunctionContext, _columns: &[VectorRef]) -> Result<VectorRef> {
let result = StringVector::from(vec![format!(
"PostgreSQL 16.3 GreptimeDB {}",
env!("CARGO_PKG_VERSION")
common_version::version()
)]);
Ok(Arc::new(result))
}

View File

@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt;
use std::sync::Arc;
use std::{env, fmt};
use common_query::error::Result;
use common_query::prelude::{Signature, Volatility};
@@ -52,13 +52,13 @@ impl Function for VersionFunction {
"{}-greptimedb-{}",
std::env::var("GREPTIMEDB_MYSQL_SERVER_VERSION")
.unwrap_or_else(|_| "8.4.2".to_string()),
env!("CARGO_PKG_VERSION")
common_version::version()
)
}
Channel::Postgres => {
format!("16.3-greptimedb-{}", env!("CARGO_PKG_VERSION"))
format!("16.3-greptimedb-{}", common_version::version())
}
_ => env!("CARGO_PKG_VERSION").to_string(),
_ => common_version::version().to_string(),
};
let result = StringVector::from(vec![version]);
Ok(Arc::new(result))

View File

@@ -16,8 +16,8 @@ use std::env;
use std::io::ErrorKind;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use std::sync::{Arc, LazyLock};
use std::time::{Duration, SystemTime};
use common_runtime::error::{Error, Result};
use common_runtime::{BoxedTaskFunction, RepeatedTask, TaskFunction};
@@ -31,6 +31,9 @@ pub const TELEMETRY_URL: &str = "https://telemetry.greptimestats.com/db/otel/sta
/// The local installation uuid cache file
const UUID_FILE_NAME: &str = ".greptimedb-telemetry-uuid";
/// System start time for uptime calculation
static START_TIME: LazyLock<SystemTime> = LazyLock::new(SystemTime::now);
/// The default interval of reporting telemetry data to greptime cloud
pub static TELEMETRY_INTERVAL: Duration = Duration::from_secs(60 * 30);
/// The default connect timeout to greptime cloud.
@@ -103,6 +106,8 @@ struct StatisticData {
pub nodes: Option<i32>,
/// The local installation uuid
pub uuid: String,
/// System uptime range (e.g., "hours", "days", "weeks")
pub uptime: String,
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
@@ -171,6 +176,25 @@ fn print_anonymous_usage_data_disclaimer() {
info!("https://docs.greptime.com/reference/telemetry");
}
/// Format uptime duration into a general time range string
/// Returns privacy-friendly descriptions like "hours", "days", etc.
fn format_uptime() -> String {
let uptime_duration = START_TIME.elapsed().unwrap_or(Duration::ZERO);
let total_seconds = uptime_duration.as_secs();
if total_seconds < 86400 {
// uptime below one day
"hours".to_string()
} else if total_seconds < 604800 {
// below one week
"days".to_string()
} else if total_seconds < 2629746 {
// below one average month (31_556_952 / 12 seconds, ~30.44 days)
"weeks".to_string()
} else if total_seconds < 31556952 {
// below one average Gregorian year (365.2425 days)
"months".to_string()
} else {
"years".to_string()
}
}
pub fn default_get_uuid(working_home: &Option<String>) -> Option<String> {
let temp_dir = env::temp_dir();
@@ -260,6 +284,7 @@ impl GreptimeDBTelemetry {
mode: self.statistics.get_mode(),
nodes: self.statistics.get_nodes().await,
uuid,
uptime: format_uptime(),
};
if let Some(client) = self.client.as_ref() {
@@ -294,7 +319,9 @@ mod tests {
use reqwest::{Client, Response};
use tokio::spawn;
use crate::{default_get_uuid, Collector, GreptimeDBTelemetry, Mode, StatisticData};
use crate::{
default_get_uuid, format_uptime, Collector, GreptimeDBTelemetry, Mode, StatisticData,
};
static COUNT: AtomicUsize = std::sync::atomic::AtomicUsize::new(0);
@@ -438,6 +465,7 @@ mod tests {
assert_eq!(build_info().commit, body.git_commit);
assert_eq!(Mode::Standalone, body.mode);
assert_eq!(1, body.nodes.unwrap());
assert!(!body.uptime.is_empty());
let failed_statistic = Box::new(FailedStatistic);
let failed_report = GreptimeDBTelemetry::new(
@@ -477,4 +505,18 @@ mod tests {
assert_eq!(uuid, default_get_uuid(&Some(working_home.clone())));
assert_eq!(uuid, default_get_uuid(&Some(working_home)));
}
#[test]
fn test_format_uptime() {
let uptime = format_uptime();
assert!(!uptime.is_empty());
// Should be a valid general time range (no specific numbers)
assert!(
uptime == "hours"
|| uptime == "days"
|| uptime == "weeks"
|| uptime == "months"
|| uptime == "years"
);
}
}

View File

@@ -14,6 +14,7 @@ common-catalog.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-query.workspace = true
common-sql.workspace = true
common-time.workspace = true
datatypes.workspace = true
prost.workspace = true

View File

@@ -27,24 +27,84 @@ use common_query::AddColumnLocation;
use datatypes::schema::{ColumnSchema, FulltextOptions, RawSchema, SkippingIndexOptions};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::region_request::{SetRegionOption, UnsetRegionOption};
use table::metadata::TableId;
use table::metadata::{TableId, TableMeta};
use table::requests::{
AddColumnRequest, AlterKind, AlterTableRequest, ModifyColumnTypeRequest, SetIndexOptions,
UnsetIndexOptions,
AddColumnRequest, AlterKind, AlterTableRequest, ModifyColumnTypeRequest, SetDefaultRequest,
SetIndexOption, UnsetIndexOption,
};
use crate::error::{
InvalidColumnDefSnafu, InvalidIndexOptionSnafu, InvalidSetFulltextOptionRequestSnafu,
InvalidSetSkippingIndexOptionRequestSnafu, InvalidSetTableOptionRequestSnafu,
InvalidUnsetTableOptionRequestSnafu, MissingAlterIndexOptionSnafu, MissingFieldSnafu,
ColumnNotFoundSnafu, InvalidColumnDefSnafu, InvalidIndexOptionSnafu,
InvalidSetFulltextOptionRequestSnafu, InvalidSetSkippingIndexOptionRequestSnafu,
InvalidSetTableOptionRequestSnafu, InvalidUnsetTableOptionRequestSnafu,
MissingAlterIndexOptionSnafu, MissingFieldSnafu, MissingTableMetaSnafu,
MissingTimestampColumnSnafu, Result, UnknownLocationTypeSnafu,
};
const LOCATION_TYPE_FIRST: i32 = LocationType::First as i32;
const LOCATION_TYPE_AFTER: i32 = LocationType::After as i32;
fn set_index_option_from_proto(set_index: api::v1::SetIndex) -> Result<SetIndexOption> {
let options = set_index.options.context(MissingAlterIndexOptionSnafu)?;
Ok(match options {
api::v1::set_index::Options::Fulltext(f) => SetIndexOption::Fulltext {
column_name: f.column_name.clone(),
options: FulltextOptions::new(
f.enable,
as_fulltext_option_analyzer(
Analyzer::try_from(f.analyzer).context(InvalidSetFulltextOptionRequestSnafu)?,
),
f.case_sensitive,
as_fulltext_option_backend(
PbFulltextBackend::try_from(f.backend)
.context(InvalidSetFulltextOptionRequestSnafu)?,
),
f.granularity as u32,
f.false_positive_rate,
)
.context(InvalidIndexOptionSnafu)?,
},
api::v1::set_index::Options::Inverted(i) => SetIndexOption::Inverted {
column_name: i.column_name,
},
api::v1::set_index::Options::Skipping(s) => SetIndexOption::Skipping {
column_name: s.column_name,
options: SkippingIndexOptions::new(
s.granularity as u32,
s.false_positive_rate,
as_skipping_index_type(
PbSkippingIndexType::try_from(s.skipping_index_type)
.context(InvalidSetSkippingIndexOptionRequestSnafu)?,
),
)
.context(InvalidIndexOptionSnafu)?,
},
})
}
fn unset_index_option_from_proto(unset_index: api::v1::UnsetIndex) -> Result<UnsetIndexOption> {
let options = unset_index.options.context(MissingAlterIndexOptionSnafu)?;
Ok(match options {
api::v1::unset_index::Options::Fulltext(f) => UnsetIndexOption::Fulltext {
column_name: f.column_name,
},
api::v1::unset_index::Options::Inverted(i) => UnsetIndexOption::Inverted {
column_name: i.column_name,
},
api::v1::unset_index::Options::Skipping(s) => UnsetIndexOption::Skipping {
column_name: s.column_name,
},
})
}
/// Convert an [`AlterTableExpr`] to an [`AlterTableRequest`]
pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result<AlterTableRequest> {
///
/// Note: `table_meta` must not be `None` if the [`AlterTableExpr`] kind is `SetDefaults`.
pub fn alter_expr_to_request(
table_id: TableId,
expr: AlterTableExpr,
table_meta: Option<&TableMeta>,
) -> Result<AlterTableRequest> {
let catalog_name = expr.catalog_name;
let schema_name = expr.schema_name;
let kind = expr.kind.context(MissingFieldSnafu { field: "kind" })?;
@@ -121,70 +181,34 @@ pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result<
.context(InvalidUnsetTableOptionRequestSnafu)?,
}
}
Kind::SetIndex(o) => match o.options {
Some(opt) => match opt {
api::v1::set_index::Options::Fulltext(f) => AlterKind::SetIndex {
options: SetIndexOptions::Fulltext {
column_name: f.column_name.clone(),
options: FulltextOptions::new(
f.enable,
as_fulltext_option_analyzer(
Analyzer::try_from(f.analyzer)
.context(InvalidSetFulltextOptionRequestSnafu)?,
),
f.case_sensitive,
as_fulltext_option_backend(
PbFulltextBackend::try_from(f.backend)
.context(InvalidSetFulltextOptionRequestSnafu)?,
),
f.granularity as u32,
f.false_positive_rate,
)
.context(InvalidIndexOptionSnafu)?,
},
},
api::v1::set_index::Options::Inverted(i) => AlterKind::SetIndex {
options: SetIndexOptions::Inverted {
column_name: i.column_name,
},
},
api::v1::set_index::Options::Skipping(s) => AlterKind::SetIndex {
options: SetIndexOptions::Skipping {
column_name: s.column_name,
options: SkippingIndexOptions::new(
s.granularity as u32,
s.false_positive_rate,
as_skipping_index_type(
PbSkippingIndexType::try_from(s.skipping_index_type)
.context(InvalidSetSkippingIndexOptionRequestSnafu)?,
),
)
.context(InvalidIndexOptionSnafu)?,
},
},
},
None => return MissingAlterIndexOptionSnafu.fail(),
},
Kind::UnsetIndex(o) => match o.options {
Some(opt) => match opt {
api::v1::unset_index::Options::Fulltext(f) => AlterKind::UnsetIndex {
options: UnsetIndexOptions::Fulltext {
column_name: f.column_name,
},
},
api::v1::unset_index::Options::Inverted(i) => AlterKind::UnsetIndex {
options: UnsetIndexOptions::Inverted {
column_name: i.column_name,
},
},
api::v1::unset_index::Options::Skipping(s) => AlterKind::UnsetIndex {
options: UnsetIndexOptions::Skipping {
column_name: s.column_name,
},
},
},
None => return MissingAlterIndexOptionSnafu.fail(),
},
Kind::SetIndex(o) => {
let option = set_index_option_from_proto(o)?;
AlterKind::SetIndexes {
options: vec![option],
}
}
Kind::UnsetIndex(o) => {
let option = unset_index_option_from_proto(o)?;
AlterKind::UnsetIndexes {
options: vec![option],
}
}
Kind::SetIndexes(o) => {
let options = o
.set_indexes
.into_iter()
.map(set_index_option_from_proto)
.collect::<Result<Vec<_>>>()?;
AlterKind::SetIndexes { options }
}
Kind::UnsetIndexes(o) => {
let options = o
.unset_indexes
.into_iter()
.map(unset_index_option_from_proto)
.collect::<Result<Vec<_>>>()?;
AlterKind::UnsetIndexes { options }
}
Kind::DropDefaults(o) => {
let names = o
.drop_defaults
@@ -201,6 +225,32 @@ pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result<
.collect::<Result<Vec<_>>>()?;
AlterKind::DropDefaults { names }
}
Kind::SetDefaults(o) => {
let table_meta = table_meta.context(MissingTableMetaSnafu { table_id })?;
let defaults = o
.set_defaults
.into_iter()
.map(|col| {
let column_scheme = table_meta
.schema
.column_schema_by_name(&col.column_name)
.context(ColumnNotFoundSnafu {
column_name: &col.column_name,
})?;
let default_constraint = common_sql::convert::deserialize_default_constraint(
col.default_constraint.as_slice(),
&col.column_name,
&column_scheme.data_type,
)
.context(crate::error::SqlCommonSnafu)?;
Ok(SetDefaultRequest {
column_name: col.column_name,
default_constraint,
})
})
.collect::<Result<Vec<_>>>()?;
AlterKind::SetDefaults { defaults }
}
};
let request = AlterTableRequest {
@@ -300,7 +350,7 @@ mod tests {
})),
};
let alter_request = alter_expr_to_request(1, expr).unwrap();
let alter_request = alter_expr_to_request(1, expr, None).unwrap();
assert_eq!(alter_request.catalog_name, "");
assert_eq!(alter_request.schema_name, "");
assert_eq!("monitor".to_string(), alter_request.table_name);
@@ -364,7 +414,7 @@ mod tests {
})),
};
let alter_request = alter_expr_to_request(1, expr).unwrap();
let alter_request = alter_expr_to_request(1, expr, None).unwrap();
assert_eq!(alter_request.catalog_name, "");
assert_eq!(alter_request.schema_name, "");
assert_eq!("monitor".to_string(), alter_request.table_name);
@@ -416,7 +466,7 @@ mod tests {
})),
};
let alter_request = alter_expr_to_request(1, expr).unwrap();
let alter_request = alter_expr_to_request(1, expr, None).unwrap();
assert_eq!(alter_request.catalog_name, "test_catalog");
assert_eq!(alter_request.schema_name, "test_schema");
assert_eq!("monitor".to_string(), alter_request.table_name);
@@ -448,7 +498,7 @@ mod tests {
})),
};
let alter_request = alter_expr_to_request(1, expr).unwrap();
let alter_request = alter_expr_to_request(1, expr, None).unwrap();
assert_eq!(alter_request.catalog_name, "test_catalog");
assert_eq!(alter_request.schema_name, "test_schema");
assert_eq!("monitor".to_string(), alter_request.table_name);

View File

@@ -161,6 +161,27 @@ pub enum Error {
#[snafu(source)]
error: datatypes::error::Error,
},
#[snafu(display("Sql common error"))]
SqlCommon {
source: common_sql::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Missing required field in protobuf, column name: {}", column_name))]
ColumnNotFound {
column_name: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Need table metadata, but not found, table_id: {}", table_id))]
MissingTableMeta {
table_id: u32,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -190,6 +211,9 @@ impl ErrorExt for Error {
| Error::InvalidSetSkippingIndexOptionRequest { .. }
| Error::MissingAlterIndexOption { .. }
| Error::InvalidIndexOption { .. } => StatusCode::InvalidArguments,
Error::ColumnNotFound { .. } => StatusCode::TableColumnNotFound,
Error::SqlCommon { source, .. } => source.status_code(),
Error::MissingTableMeta { .. } => StatusCode::Unexpected,
}
}

View File

@@ -31,6 +31,7 @@ tokio.workspace = true
tokio-util.workspace = true
tonic.workspace = true
tower.workspace = true
vec1 = "1.12"
[dev-dependencies]
criterion = "0.4"

View File

@@ -84,9 +84,12 @@ fn prepare_random_record_batch(
fn prepare_flight_data(num_rows: usize) -> (FlightData, FlightData) {
let schema = schema();
let mut encoder = FlightEncoder::default();
let schema_data = encoder.encode(FlightMessage::Schema(schema.clone()));
let schema_data = encoder.encode_schema(schema.as_ref());
let rb = prepare_random_record_batch(schema, num_rows);
let rb_data = encoder.encode(FlightMessage::RecordBatch(rb));
let [rb_data] = encoder
.encode(FlightMessage::RecordBatch(rb))
.try_into()
.unwrap();
(schema_data, rb_data)
}
@@ -96,7 +99,7 @@ fn decode_flight_data_from_protobuf(schema: &Bytes, payload: &Bytes) -> DfRecord
let mut decoder = FlightDecoder::default();
let _schema = decoder.try_decode(&schema).unwrap();
let message = decoder.try_decode(&payload).unwrap();
let FlightMessage::RecordBatch(batch) = message else {
let Some(FlightMessage::RecordBatch(batch)) = message else {
unreachable!("unexpected message");
};
batch

View File

@@ -23,6 +23,7 @@ use arrow_flight::{FlightData, SchemaAsIpc};
use common_base::bytes::Bytes;
use common_recordbatch::DfRecordBatch;
use datatypes::arrow;
use datatypes::arrow::array::ArrayRef;
use datatypes::arrow::buffer::Buffer;
use datatypes::arrow::datatypes::{Schema as ArrowSchema, SchemaRef};
use datatypes::arrow::error::ArrowError;
@@ -31,6 +32,7 @@ use flatbuffers::FlatBufferBuilder;
use prost::bytes::Bytes as ProstBytes;
use prost::Message;
use snafu::{OptionExt, ResultExt};
use vec1::{vec1, Vec1};
use crate::error;
use crate::error::{DecodeFlightDataSnafu, InvalidFlightDataSnafu, Result};
@@ -77,9 +79,19 @@ impl FlightEncoder {
}
}
pub fn encode(&mut self, flight_message: FlightMessage) -> FlightData {
/// Encode the Arrow schema to [FlightData].
pub fn encode_schema(&self, schema: &ArrowSchema) -> FlightData {
SchemaAsIpc::new(schema, &self.write_options).into()
}
/// Encode the [FlightMessage] to a list (at least one element) of [FlightData]s.
///
/// Normally the encoder produces more than one [FlightData] only when the [FlightMessage] is an
/// Arrow [RecordBatch] containing dictionary arrays; other kinds of [FlightMessage] are encoded
/// to exactly one [FlightData].
pub fn encode(&mut self, flight_message: FlightMessage) -> Vec1<FlightData> {
match flight_message {
FlightMessage::Schema(schema) => SchemaAsIpc::new(&schema, &self.write_options).into(),
FlightMessage::Schema(schema) => vec1![self.encode_schema(schema.as_ref())],
FlightMessage::RecordBatch(record_batch) => {
let (encoded_dictionaries, encoded_batch) = self
.data_gen
@@ -90,14 +102,10 @@ impl FlightEncoder {
)
.expect("DictionaryTracker configured above to not fail on replacement");
// TODO(LFC): Handle dictionary as FlightData here, when we supported Arrow's Dictionary DataType.
// Currently we don't have a datatype corresponding to Arrow's Dictionary DataType,
// so there won't be any "dictionaries" here. Assert to be sure about it, and
// perform a "testing guard" in case we forgot to handle the possible "dictionaries"
// here in the future.
debug_assert_eq!(encoded_dictionaries.len(), 0);
encoded_batch.into()
Vec1::from_vec_push(
encoded_dictionaries.into_iter().map(Into::into).collect(),
encoded_batch.into(),
)
}
FlightMessage::AffectedRows(rows) => {
let metadata = FlightMetadata {
@@ -105,12 +113,12 @@ impl FlightEncoder {
metrics: None,
}
.encode_to_vec();
FlightData {
vec1![FlightData {
flight_descriptor: None,
data_header: build_none_flight_msg().into(),
app_metadata: metadata.into(),
data_body: ProstBytes::default(),
}
}]
}
FlightMessage::Metrics(s) => {
let metadata = FlightMetadata {
@@ -120,12 +128,12 @@ impl FlightEncoder {
}),
}
.encode_to_vec();
FlightData {
vec1![FlightData {
flight_descriptor: None,
data_header: build_none_flight_msg().into(),
app_metadata: metadata.into(),
data_body: ProstBytes::default(),
}
}]
}
}
}
@@ -135,6 +143,7 @@ impl FlightEncoder {
pub struct FlightDecoder {
schema: Option<SchemaRef>,
schema_bytes: Option<bytes::Bytes>,
dictionaries_by_id: HashMap<i64, ArrayRef>,
}
impl FlightDecoder {
@@ -145,6 +154,7 @@ impl FlightDecoder {
Ok(Self {
schema: Some(Arc::new(arrow_schema)),
schema_bytes: Some(schema_bytes.clone()),
dictionaries_by_id: HashMap::new(),
})
}
@@ -186,7 +196,13 @@ impl FlightDecoder {
Ok(result)
}
pub fn try_decode(&mut self, flight_data: &FlightData) -> Result<FlightMessage> {
/// Try to decode the [FlightData] to a [FlightMessage].
///
/// If the [FlightData] is of type `DictionaryBatch` (produced while encoding an Arrow
/// [RecordBatch] with dictionary arrays), the decoder will not return any [FlightMessage]s.
/// Instead, it will update its internal dictionary cache. Other types of [FlightData] will
/// be decoded to exactly one [FlightMessage].
pub fn try_decode(&mut self, flight_data: &FlightData) -> Result<Option<FlightMessage>> {
let message = root_as_message(&flight_data.data_header).map_err(|e| {
InvalidFlightDataSnafu {
reason: e.to_string(),
@@ -198,12 +214,12 @@ impl FlightDecoder {
let metadata = FlightMetadata::decode(flight_data.app_metadata.clone())
.context(DecodeFlightDataSnafu)?;
if let Some(AffectedRows { value }) = metadata.affected_rows {
return Ok(FlightMessage::AffectedRows(value as _));
return Ok(Some(FlightMessage::AffectedRows(value as _)));
}
if let Some(Metrics { metrics }) = metadata.metrics {
return Ok(FlightMessage::Metrics(
return Ok(Some(FlightMessage::Metrics(
String::from_utf8_lossy(&metrics).to_string(),
));
)));
}
InvalidFlightDataSnafu {
reason: "Expecting FlightMetadata have some meaningful content.",
@@ -219,21 +235,46 @@ impl FlightDecoder {
})?);
self.schema = Some(arrow_schema.clone());
self.schema_bytes = Some(flight_data.data_header.clone());
Ok(FlightMessage::Schema(arrow_schema))
Ok(Some(FlightMessage::Schema(arrow_schema)))
}
MessageHeader::RecordBatch => {
let schema = self.schema.clone().context(InvalidFlightDataSnafu {
reason: "Should have decoded schema first!",
})?;
let arrow_batch =
flight_data_to_arrow_batch(flight_data, schema.clone(), &HashMap::new())
.map_err(|e| {
InvalidFlightDataSnafu {
reason: e.to_string(),
}
.build()
let arrow_batch = flight_data_to_arrow_batch(
flight_data,
schema.clone(),
&self.dictionaries_by_id,
)
.map_err(|e| {
InvalidFlightDataSnafu {
reason: e.to_string(),
}
.build()
})?;
Ok(Some(FlightMessage::RecordBatch(arrow_batch)))
}
MessageHeader::DictionaryBatch => {
let dictionary_batch =
message
.header_as_dictionary_batch()
.context(InvalidFlightDataSnafu {
reason: "could not get dictionary batch from DictionaryBatch message",
})?;
Ok(FlightMessage::RecordBatch(arrow_batch))
let schema = self.schema.as_ref().context(InvalidFlightDataSnafu {
reason: "schema message is not present previously",
})?;
reader::read_dictionary(
&flight_data.data_body.clone().into(),
dictionary_batch,
schema,
&mut self.dictionaries_by_id,
&message.version(),
)
.context(error::ArrowSnafu)?;
Ok(None)
}
other => {
let name = other.variant_name().unwrap_or("UNKNOWN");
@@ -305,14 +346,16 @@ fn build_none_flight_msg() -> Bytes {
#[cfg(test)]
mod test {
use arrow_flight::utils::batches_to_flight_data;
use datatypes::arrow::array::Int32Array;
use datatypes::arrow::array::{
DictionaryArray, Int32Array, StringArray, UInt32Array, UInt8Array,
};
use datatypes::arrow::datatypes::{DataType, Field, Schema};
use super::*;
use crate::Error;
#[test]
fn test_try_decode() {
fn test_try_decode() -> Result<()> {
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
"n",
DataType::Int32,
@@ -347,7 +390,7 @@ mod test {
.to_string()
.contains("Should have decoded schema first!"));
let message = decoder.try_decode(d1).unwrap();
let message = decoder.try_decode(d1)?.unwrap();
assert!(matches!(message, FlightMessage::Schema(_)));
let FlightMessage::Schema(decoded_schema) = message else {
unreachable!()
@@ -356,19 +399,20 @@ mod test {
let _ = decoder.schema.as_ref().unwrap();
let message = decoder.try_decode(d2).unwrap();
let message = decoder.try_decode(d2)?.unwrap();
assert!(matches!(message, FlightMessage::RecordBatch(_)));
let FlightMessage::RecordBatch(actual_batch) = message else {
unreachable!()
};
assert_eq!(actual_batch, batch1);
let message = decoder.try_decode(d3).unwrap();
let message = decoder.try_decode(d3)?.unwrap();
assert!(matches!(message, FlightMessage::RecordBatch(_)));
let FlightMessage::RecordBatch(actual_batch) = message else {
unreachable!()
};
assert_eq!(actual_batch, batch2);
Ok(())
}
#[test]
@@ -407,4 +451,86 @@ mod test {
let actual = flight_messages_to_recordbatches(vec![m1, m2, m3]).unwrap();
assert_eq!(actual, recordbatches);
}
#[test]
fn test_flight_encode_decode_with_dictionary_array() -> Result<()> {
let schema = Arc::new(Schema::new(vec![
Field::new("i", DataType::UInt8, true),
Field::new_dictionary("s", DataType::UInt32, DataType::Utf8, true),
]));
let batch1 = DfRecordBatch::try_new(
schema.clone(),
vec![
Arc::new(UInt8Array::from_iter_values(vec![1, 2, 3])) as _,
Arc::new(DictionaryArray::new(
UInt32Array::from_value(0, 3),
Arc::new(StringArray::from_iter_values(["x"])),
)) as _,
],
)
.unwrap();
let batch2 = DfRecordBatch::try_new(
schema.clone(),
vec![
Arc::new(UInt8Array::from_iter_values(vec![4, 5, 6, 7, 8])) as _,
Arc::new(DictionaryArray::new(
UInt32Array::from_iter_values([0, 1, 2, 2, 3]),
Arc::new(StringArray::from_iter_values(["h", "e", "l", "o"])),
)) as _,
],
)
.unwrap();
let message_1 = FlightMessage::Schema(schema.clone());
let message_2 = FlightMessage::RecordBatch(batch1);
let message_3 = FlightMessage::RecordBatch(batch2);
let mut encoder = FlightEncoder::default();
let encoded_1 = encoder.encode(message_1);
let encoded_2 = encoder.encode(message_2);
let encoded_3 = encoder.encode(message_3);
// message 1 is Arrow Schema, should be encoded to one FlightData:
assert_eq!(encoded_1.len(), 1);
// message 2 and 3 are Arrow RecordBatch with dictionary arrays, should be encoded to
// multiple FlightData:
assert_eq!(encoded_2.len(), 2);
assert_eq!(encoded_3.len(), 2);
let mut decoder = FlightDecoder::default();
let decoded_1 = decoder.try_decode(encoded_1.first())?;
let Some(FlightMessage::Schema(actual_schema)) = decoded_1 else {
unreachable!()
};
assert_eq!(actual_schema, schema);
let decoded_2 = decoder.try_decode(&encoded_2[0])?;
// expected to be a dictionary batch message, decoder should return none:
assert!(decoded_2.is_none());
let Some(FlightMessage::RecordBatch(decoded_2)) = decoder.try_decode(&encoded_2[1])? else {
unreachable!()
};
let decoded_3 = decoder.try_decode(&encoded_3[0])?;
// expected to be a dictionary batch message, decoder should return none:
assert!(decoded_3.is_none());
let Some(FlightMessage::RecordBatch(decoded_3)) = decoder.try_decode(&encoded_3[1])? else {
unreachable!()
};
let actual = arrow::util::pretty::pretty_format_batches(&[decoded_2, decoded_3])
.unwrap()
.to_string();
let expected = r"
+---+---+
| i | s |
+---+---+
| 1 | x |
| 2 | x |
| 3 | x |
| 4 | h |
| 5 | e |
| 6 | l |
| 7 | l |
| 8 | o |
+---+---+";
assert_eq!(actual, expected.trim());
Ok(())
}
}
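// Editor's note: an illustrative sketch (not part of this diff) of how a caller might pair
// the new APIs: `encode` now returns one or more FlightData per message (the extra ones carry
// dictionary batches), and `try_decode` returns `None` for the dictionary batches it folds
// into its internal cache. The in-memory `wire` vector stands in for whatever transport is
// actually used.
#[allow(dead_code)]
fn encode_decode_roundtrip(messages: Vec<FlightMessage>) -> Result<Vec<FlightMessage>> {
let mut encoder = FlightEncoder::default();
let mut wire: Vec<FlightData> = Vec::new();
for message in messages {
// Vec1 guarantees at least one FlightData per encoded message.
wire.extend(encoder.encode(message).into_vec());
}
let mut decoder = FlightDecoder::default();
let mut decoded = Vec::new();
for data in &wire {
// `None` means `data` was a DictionaryBatch that only updated the decoder's cache.
if let Some(message) = decoder.try_decode(data)? {
decoded.push(message);
}
}
Ok(decoded)
}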

View File

@@ -70,7 +70,7 @@ macro_rules! convert_arrow_array_to_grpc_vals {
return Ok(vals);
},
)+
ConcreteDataType::Null(_) | ConcreteDataType::List(_) | ConcreteDataType::Dictionary(_) | ConcreteDataType::Duration(_) | ConcreteDataType::Json(_) => unreachable!("Should not send {:?} in gRPC", $data_type),
ConcreteDataType::Null(_) | ConcreteDataType::List(_) | ConcreteDataType::Struct(_) | ConcreteDataType::Dictionary(_) | ConcreteDataType::Duration(_) | ConcreteDataType::Json(_) => unreachable!("Should not send {:?} in gRPC", $data_type),
}
}};
}

View File

@@ -19,8 +19,8 @@ use std::io::BufReader;
use std::path::PathBuf;
use error::{
BuildTempPathSnafu, DumpProfileDataSnafu, OpenTempFileSnafu, ProfilingNotEnabledSnafu,
ReadOptProfSnafu,
ActivateProfSnafu, BuildTempPathSnafu, DeactivateProfSnafu, DumpProfileDataSnafu,
OpenTempFileSnafu, ProfilingNotEnabledSnafu, ReadOptProfSnafu, ReadProfActiveSnafu,
};
use jemalloc_pprof_mappings::MAPPINGS;
use jemalloc_pprof_utils::{parse_jeheap, FlamegraphOptions, StackProfile};
@@ -31,6 +31,7 @@ use crate::error::{FlamegraphSnafu, ParseJeHeapSnafu, Result};
const PROF_DUMP: &[u8] = b"prof.dump\0";
const OPT_PROF: &[u8] = b"opt.prof\0";
const PROF_ACTIVE: &[u8] = b"prof.active\0";
pub async fn dump_profile() -> Result<Vec<u8>> {
ensure!(is_prof_enabled()?, ProfilingNotEnabledSnafu);
@@ -93,6 +94,27 @@ pub async fn dump_flamegraph() -> Result<Vec<u8>> {
let flamegraph = profile.to_flamegraph(&mut opts).context(FlamegraphSnafu)?;
Ok(flamegraph)
}
pub fn activate_heap_profile() -> Result<()> {
ensure!(is_prof_enabled()?, ProfilingNotEnabledSnafu);
unsafe {
tikv_jemalloc_ctl::raw::update(PROF_ACTIVE, true).context(ActivateProfSnafu)?;
}
Ok(())
}
pub fn deactivate_heap_profile() -> Result<()> {
ensure!(is_prof_enabled()?, ProfilingNotEnabledSnafu);
unsafe {
tikv_jemalloc_ctl::raw::update(PROF_ACTIVE, false).context(DeactivateProfSnafu)?;
}
Ok(())
}
pub fn is_heap_profile_active() -> Result<bool> {
unsafe { Ok(tikv_jemalloc_ctl::raw::read::<bool>(PROF_ACTIVE).context(ReadProfActiveSnafu)?) }
}
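// Editor's note: an illustrative sketch, not part of this diff. It shows one way the new
// activation controls could bracket a workload around the existing dump function; the generic
// `workload` future is an assumption for illustration. As with the other profiling entry
// points, the process must run with `opt.prof` enabled for these calls to succeed.
#[allow(dead_code)]
async fn profile_workload<F, T>(workload: F) -> Result<(T, Vec<u8>)>
where
F: std::future::Future<Output = T>,
{
activate_heap_profile()?; // switch `prof.active` on
let output = workload.await; // run the code to be profiled
let profile = dump_profile().await?; // dump while profiling is active
deactivate_heap_profile()?; // switch `prof.active` back off
Ok((output, profile))
}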
fn is_prof_enabled() -> Result<bool> {
// safety: OPT_PROF variable, if present, is always a boolean value.
Ok(unsafe { tikv_jemalloc_ctl::raw::read::<bool>(OPT_PROF).context(ReadOptProfSnafu)? })

View File

@@ -53,6 +53,24 @@ pub enum Error {
#[snafu(source)]
error: tikv_jemalloc_ctl::Error,
},
#[snafu(display("Failed to activate heap profiling"))]
ActivateProf {
#[snafu(source)]
error: tikv_jemalloc_ctl::Error,
},
#[snafu(display("Failed to deactivate heap profiling"))]
DeactivateProf {
#[snafu(source)]
error: tikv_jemalloc_ctl::Error,
},
#[snafu(display("Failed to read heap profiling status"))]
ReadProfActive {
#[snafu(source)]
error: tikv_jemalloc_ctl::Error,
},
}
impl ErrorExt for Error {
@@ -63,6 +81,9 @@ impl ErrorExt for Error {
Error::BuildTempPath { .. } => StatusCode::Internal,
Error::OpenTempFile { .. } => StatusCode::StorageUnavailable,
Error::DumpProfileData { .. } => StatusCode::StorageUnavailable,
Error::ActivateProf { .. } => StatusCode::Internal,
Error::DeactivateProf { .. } => StatusCode::Internal,
Error::ReadProfActive { .. } => StatusCode::Internal,
}
}

View File

@@ -17,7 +17,10 @@ pub mod error;
#[cfg(not(windows))]
mod jemalloc;
#[cfg(not(windows))]
pub use jemalloc::{dump_flamegraph, dump_pprof, dump_profile};
pub use jemalloc::{
activate_heap_profile, deactivate_heap_profile, dump_flamegraph, dump_pprof, dump_profile,
is_heap_profile_active,
};
#[cfg(windows)]
pub async fn dump_profile() -> error::Result<Vec<u8>> {
@@ -33,3 +36,18 @@ pub async fn dump_pprof() -> error::Result<Vec<u8>> {
pub async fn dump_flamegraph() -> error::Result<Vec<u8>> {
error::ProfilingNotSupportedSnafu.fail()
}
#[cfg(windows)]
pub fn activate_heap_profile() -> error::Result<()> {
error::ProfilingNotSupportedSnafu.fail()
}
#[cfg(windows)]
pub fn deactivate_heap_profile() -> error::Result<()> {
error::ProfilingNotSupportedSnafu.fail()
}
#[cfg(windows)]
pub fn is_heap_profile_active() -> error::Result<bool> {
error::ProfilingNotSupportedSnafu.fail()
}

View File

@@ -6,7 +6,16 @@ license.workspace = true
[features]
testing = []
pg_kvbackend = ["dep:tokio-postgres", "dep:backon", "dep:deadpool-postgres", "dep:deadpool"]
pg_kvbackend = [
"dep:tokio-postgres",
"dep:backon",
"dep:deadpool-postgres",
"dep:deadpool",
"dep:tokio-postgres-rustls",
"dep:rustls-pemfile",
"dep:rustls-native-certs",
"dep:rustls",
]
mysql_kvbackend = ["dep:sqlx", "dep:backon"]
enterprise = []
@@ -57,6 +66,9 @@ prost.workspace = true
rand.workspace = true
regex.workspace = true
rskafka.workspace = true
rustls = { workspace = true, default-features = false, features = ["ring", "logging", "std", "tls12"], optional = true }
rustls-native-certs = { version = "0.7", optional = true }
rustls-pemfile = { version = "2.0", optional = true }
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
@@ -68,7 +80,9 @@ strum.workspace = true
table = { workspace = true, features = ["testing"] }
tokio.workspace = true
tokio-postgres = { workspace = true, optional = true }
tokio-postgres-rustls = { version = "0.12", optional = true }
tonic.workspace = true
tracing.workspace = true
typetag.workspace = true
[dev-dependencies]

View File

@@ -15,6 +15,7 @@
use std::collections::HashMap;
use std::sync::Arc;
use common_telemetry::info;
use futures::future::BoxFuture;
use moka::future::Cache;
use moka::ops::compute::Op;
@@ -89,6 +90,12 @@ fn init_factory(table_flow_manager: TableFlowManagerRef) -> Initializer<TableId,
// we have a corresponding cache invalidation mechanism to invalidate `(Key, EmptyHashSet)`.
.map(Arc::new)
.map(Some)
.inspect(|set| {
info!(
"Initialized table_flownode cache for table_id: {}, set: {:?}",
table_id, set
);
})
})
})
}
@@ -167,6 +174,13 @@ fn invalidator<'a>(
match ident {
CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
CacheIdent::FlowNodeAddressChange(node_id) => {
info!(
"Invalidate flow node cache for node_id in table_flownode: {}",
node_id
);
cache.invalidate_all();
}
_ => {}
}
Ok(())
@@ -174,7 +188,10 @@ fn invalidator<'a>(
}
fn filter(ident: &CacheIdent) -> bool {
matches!(ident, CacheIdent::CreateFlow(_) | CacheIdent::DropFlow(_))
matches!(
ident,
CacheIdent::CreateFlow(_) | CacheIdent::DropFlow(_) | CacheIdent::FlowNodeAddressChange(_)
)
}
#[cfg(test)]

View File

@@ -22,6 +22,7 @@ use crate::key::flow::flow_name::FlowNameKey;
use crate::key::flow::flow_route::FlowRouteKey;
use crate::key::flow::flownode_flow::FlownodeFlowKey;
use crate::key::flow::table_flow::TableFlowKey;
use crate::key::node_address::NodeAddressKey;
use crate::key::schema_name::SchemaNameKey;
use crate::key::table_info::TableInfoKey;
use crate::key::table_name::TableNameKey;
@@ -53,6 +54,10 @@ pub struct Context {
#[async_trait::async_trait]
pub trait CacheInvalidator: Send + Sync {
async fn invalidate(&self, ctx: &Context, caches: &[CacheIdent]) -> Result<()>;
fn name(&self) -> &'static str {
std::any::type_name::<Self>()
}
}
pub type CacheInvalidatorRef = Arc<dyn CacheInvalidator>;
@@ -137,6 +142,13 @@ where
let key = FlowInfoKey::new(*flow_id);
self.invalidate_key(&key.to_bytes()).await;
}
CacheIdent::FlowNodeAddressChange(node_id) => {
// Other caches don't need to be invalidated,
// since this is only a flownode address change, not an id change.
common_telemetry::info!("Invalidate flow node cache for node_id: {}", node_id);
let key = NodeAddressKey::with_flownode(*node_id);
self.invalidate_key(&key.to_bytes()).await;
}
}
}
Ok(())

View File

@@ -93,6 +93,8 @@ pub struct RegionStat {
pub manifest_size: u64,
/// The size of the SST data files in bytes.
pub sst_size: u64,
/// The number of SST data files.
pub sst_num: u64,
/// The size of the SST index files in bytes.
pub index_size: u64,
/// The manifest info of the region.
@@ -173,8 +175,8 @@ impl RegionStat {
std::mem::size_of::<RegionId>() +
// rcus, wcus, approximate_bytes, num_rows
std::mem::size_of::<i64>() * 4 +
// memtable_size, manifest_size, sst_size, index_size
std::mem::size_of::<u64>() * 4 +
// memtable_size, manifest_size, sst_size, sst_num, index_size
std::mem::size_of::<u64>() * 5 +
// engine
std::mem::size_of::<String>() + self.engine.capacity() +
// region_manifest
@@ -275,6 +277,7 @@ impl From<&api::v1::meta::RegionStat> for RegionStat {
memtable_size: region_stat.memtable_size,
manifest_size: region_stat.manifest_size,
sst_size: region_stat.sst_size,
sst_num: region_stat.sst_num,
index_size: region_stat.index_size,
region_manifest: region_stat.manifest.into(),
data_topic_latest_entry_id: region_stat.data_topic_latest_entry_id,

View File

@@ -15,25 +15,17 @@
use std::collections::HashMap;
use std::sync::Arc;
use api::v1::meta::ProcedureDetailResponse;
use common_telemetry::tracing_context::W3cTrace;
use store_api::storage::{RegionId, RegionNumber, TableId};
use crate::cache_invalidator::CacheInvalidatorRef;
use crate::ddl::flow_meta::FlowMetadataAllocatorRef;
use crate::ddl::table_meta::TableMetadataAllocatorRef;
use crate::error::{Result, UnsupportedSnafu};
use crate::key::flow::FlowMetadataManagerRef;
use crate::key::table_route::PhysicalTableRouteValue;
use crate::key::TableMetadataManagerRef;
use crate::node_manager::NodeManagerRef;
use crate::region_keeper::MemoryRegionKeeperRef;
use crate::region_registry::LeaderRegionRegistryRef;
use crate::rpc::ddl::{SubmitDdlTaskRequest, SubmitDdlTaskResponse};
use crate::rpc::procedure::{
AddRegionFollowerRequest, MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse,
RemoveRegionFollowerRequest,
};
use crate::DatanodeId;
pub mod alter_database;
@@ -44,13 +36,13 @@ pub mod create_flow;
pub mod create_logical_tables;
pub mod create_table;
mod create_table_template;
pub(crate) use create_table_template::{build_template_from_raw_table_info, CreateRequestBuilder};
pub mod create_view;
pub mod drop_database;
pub mod drop_flow;
pub mod drop_table;
pub mod drop_view;
pub mod flow_meta;
mod physical_table_metadata;
pub mod table_meta;
#[cfg(any(test, feature = "testing"))]
pub mod test_util;
@@ -59,64 +51,6 @@ pub(crate) mod tests;
pub mod truncate_table;
pub mod utils;
#[derive(Debug, Default)]
pub struct ExecutorContext {
pub tracing_context: Option<W3cTrace>,
}
/// The procedure executor that accepts ddl, region migration task etc.
#[async_trait::async_trait]
pub trait ProcedureExecutor: Send + Sync {
/// Submit a ddl task
async fn submit_ddl_task(
&self,
ctx: &ExecutorContext,
request: SubmitDdlTaskRequest,
) -> Result<SubmitDdlTaskResponse>;
/// Add a region follower
async fn add_region_follower(
&self,
_ctx: &ExecutorContext,
_request: AddRegionFollowerRequest,
) -> Result<()> {
UnsupportedSnafu {
operation: "add_region_follower",
}
.fail()
}
/// Remove a region follower
async fn remove_region_follower(
&self,
_ctx: &ExecutorContext,
_request: RemoveRegionFollowerRequest,
) -> Result<()> {
UnsupportedSnafu {
operation: "remove_region_follower",
}
.fail()
}
/// Submit a region migration task
async fn migrate_region(
&self,
ctx: &ExecutorContext,
request: MigrateRegionRequest,
) -> Result<MigrateRegionResponse>;
/// Query the procedure state by its id
async fn query_procedure_state(
&self,
ctx: &ExecutorContext,
pid: &str,
) -> Result<ProcedureStateResponse>;
async fn list_procedures(&self, ctx: &ExecutorContext) -> Result<ProcedureDetailResponse>;
}
pub type ProcedureExecutorRef = Arc<dyn ProcedureExecutor>;
/// Metadata allocated to a table.
#[derive(Default)]
pub struct TableMetadata {

View File

@@ -12,20 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod check;
mod metadata;
mod region_request;
mod table_cache_keys;
mod executor;
mod update_metadata;
mod validator;
use api::region::RegionResponse;
use async_trait::async_trait;
use common_catalog::format_full_table_name;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{Context, LockKey, Procedure, Status};
use common_telemetry::{error, info, warn};
use futures_util::future;
pub use region_request::make_alter_region_request;
use common_telemetry::{debug, error, info, warn};
pub use executor::make_alter_region_request;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use store_api::metadata::ColumnMetadata;
@@ -33,10 +30,12 @@ use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY;
use strum::AsRefStr;
use table::metadata::TableId;
use crate::ddl::utils::{
add_peer_context_if_needed, extract_column_metadatas, map_to_procedure_error,
sync_follower_regions,
use crate::cache_invalidator::Context as CacheContext;
use crate::ddl::alter_logical_tables::executor::AlterLogicalTablesExecutor;
use crate::ddl::alter_logical_tables::validator::{
retain_unskipped, AlterLogicalTableValidator, ValidatorResult,
};
use crate::ddl::utils::{extract_column_metadatas, map_to_procedure_error, sync_follower_regions};
use crate::ddl::DdlContext;
use crate::error::Result;
use crate::instruction::CacheIdent;
@@ -46,13 +45,38 @@ use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{find_leaders, RegionRoute};
use crate::rpc::router::RegionRoute;
pub struct AlterLogicalTablesProcedure {
pub context: DdlContext,
pub data: AlterTablesData,
}
/// Builds the validator from the [`AlterTablesData`].
fn build_validator_from_alter_table_data<'a>(
data: &'a AlterTablesData,
) -> AlterLogicalTableValidator<'a> {
let physical_table_id = data.physical_table_id;
let alters = data
.tasks
.iter()
.map(|task| &task.alter_table)
.collect::<Vec<_>>();
AlterLogicalTableValidator::new(physical_table_id, alters)
}
/// Builds the executor from the [`AlterTablesData`].
fn build_executor_from_alter_expr<'a>(data: &'a AlterTablesData) -> AlterLogicalTablesExecutor<'a> {
debug_assert_eq!(data.tasks.len(), data.table_info_values.len());
let alters = data
.tasks
.iter()
.zip(data.table_info_values.iter())
.map(|(task, table_info)| (table_info.table_info.ident.table_id, &task.alter_table))
.collect::<Vec<_>>();
AlterLogicalTablesExecutor::new(alters)
}
impl AlterLogicalTablesProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::AlterLogicalTables";
@@ -82,35 +106,44 @@ impl AlterLogicalTablesProcedure {
}
pub(crate) async fn on_prepare(&mut self) -> Result<Status> {
// Checks all the tasks
self.check_input_tasks()?;
// Fills the table info values
self.fill_table_info_values().await?;
// Checks the physical table, must after [fill_table_info_values]
self.check_physical_table().await?;
// Fills the physical table info
self.fill_physical_table_info().await?;
// Filter the finished tasks
let finished_tasks = self.check_finished_tasks()?;
let already_finished_count = finished_tasks
.iter()
.map(|x| if *x { 1 } else { 0 })
.sum::<usize>();
let apply_tasks_count = self.data.tasks.len();
if already_finished_count == apply_tasks_count {
let validator = build_validator_from_alter_table_data(&self.data);
let ValidatorResult {
num_skipped,
skip_alter,
table_info_values,
physical_table_info,
physical_table_route,
} = validator
.validate(&self.context.table_metadata_manager)
.await?;
let num_tasks = self.data.tasks.len();
if num_skipped == num_tasks {
info!("All the alter tasks are finished, will skip the procedure.");
let cache_ident_keys = AlterLogicalTablesExecutor::build_cache_ident_keys(
&physical_table_info,
&table_info_values
.iter()
.map(|v| v.get_inner_ref())
.collect::<Vec<_>>(),
);
self.data.table_cache_keys_to_invalidate = cache_ident_keys;
// Re-invalidate the table cache
self.data.state = AlterTablesState::InvalidateTableCache;
return Ok(Status::executing(true));
} else if already_finished_count > 0 {
} else if num_skipped > 0 {
info!(
"There are {} alter tasks, {} of them were already finished.",
apply_tasks_count, already_finished_count
num_tasks, num_skipped
);
}
self.filter_task(&finished_tasks)?;
// Next state
// Updates the procedure state.
retain_unskipped(&mut self.data.tasks, &skip_alter);
self.data.physical_table_info = Some(physical_table_info);
self.data.physical_table_route = Some(physical_table_route);
self.data.table_info_values = table_info_values;
debug_assert_eq!(self.data.tasks.len(), self.data.table_info_values.len());
self.data.state = AlterTablesState::SubmitAlterRegionRequests;
Ok(Status::executing(true))
}
@@ -118,25 +151,13 @@ impl AlterLogicalTablesProcedure {
pub(crate) async fn on_submit_alter_region_requests(&mut self) -> Result<Status> {
// Safety: we have checked the state in on_prepare
let physical_table_route = &self.data.physical_table_route.as_ref().unwrap();
let leaders = find_leaders(&physical_table_route.region_routes);
let mut alter_region_tasks = Vec::with_capacity(leaders.len());
for peer in leaders {
let requester = self.context.node_manager.datanode(&peer).await;
let request = self.make_request(&peer, &physical_table_route.region_routes)?;
alter_region_tasks.push(async move {
requester
.handle(request)
.await
.map_err(add_peer_context_if_needed(peer))
});
}
let mut results = future::join_all(alter_region_tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;
let executor = build_executor_from_alter_expr(&self.data);
let mut results = executor
.on_alter_regions(
&self.context.node_manager,
&physical_table_route.region_routes,
)
.await?;
if let Some(column_metadatas) =
extract_column_metadatas(&mut results, ALTER_PHYSICAL_EXTENSION_KEY)?
@@ -177,7 +198,18 @@ impl AlterLogicalTablesProcedure {
self.update_physical_table_metadata().await?;
self.update_logical_tables_metadata().await?;
self.data.build_cache_keys_to_invalidate();
let logical_table_info_values = self
.data
.table_info_values
.iter()
.map(|v| v.get_inner_ref())
.collect::<Vec<_>>();
let cache_ident_keys = AlterLogicalTablesExecutor::build_cache_ident_keys(
self.data.physical_table_info.as_ref().unwrap(),
&logical_table_info_values,
);
self.data.table_cache_keys_to_invalidate = cache_ident_keys;
self.data.clear_metadata_fields();
self.data.state = AlterTablesState::InvalidateTableCache;
@@ -187,9 +219,16 @@ impl AlterLogicalTablesProcedure {
pub(crate) async fn on_invalidate_table_cache(&mut self) -> Result<Status> {
let to_invalidate = &self.data.table_cache_keys_to_invalidate;
let ctx = CacheContext {
subject: Some(format!(
"Invalidate table cache by altering logical tables, physical_table_id: {}",
self.data.physical_table_id,
)),
};
self.context
.cache_invalidator
.invalidate(&Default::default(), to_invalidate)
.invalidate(&ctx, to_invalidate)
.await?;
Ok(Status::done())
}
@@ -209,6 +248,10 @@ impl Procedure for AlterLogicalTablesProcedure {
let _timer = metrics::METRIC_META_PROCEDURE_ALTER_TABLE
.with_label_values(&[step])
.start_timer();
debug!(
"Executing alter logical tables procedure, state: {:?}",
state
);
match state {
AlterTablesState::Prepare => self.on_prepare().await,

View File

@@ -1,136 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use api::v1::alter_table_expr::Kind;
use snafu::{ensure, OptionExt};
use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
use crate::error::{AlterLogicalTablesInvalidArgumentsSnafu, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::table_route::TableRouteValue;
use crate::rpc::ddl::AlterTableTask;
impl AlterLogicalTablesProcedure {
pub(crate) fn check_input_tasks(&self) -> Result<()> {
self.check_schema()?;
self.check_alter_kind()?;
Ok(())
}
pub(crate) async fn check_physical_table(&self) -> Result<()> {
let table_route_manager = self.context.table_metadata_manager.table_route_manager();
let table_ids = self
.data
.table_info_values
.iter()
.map(|v| v.table_info.ident.table_id)
.collect::<Vec<_>>();
let table_routes = table_route_manager
.table_route_storage()
.batch_get(&table_ids)
.await?;
let physical_table_id = self.data.physical_table_id;
let is_same_physical_table = table_routes.iter().all(|r| {
if let Some(TableRouteValue::Logical(r)) = r {
r.physical_table_id() == physical_table_id
} else {
false
}
});
ensure!(
is_same_physical_table,
AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "All the tasks should have the same physical table id"
}
);
Ok(())
}
pub(crate) fn check_finished_tasks(&self) -> Result<Vec<bool>> {
let task = &self.data.tasks;
let table_info_values = &self.data.table_info_values;
Ok(task
.iter()
.zip(table_info_values.iter())
.map(|(task, table)| Self::check_finished_task(task, table))
.collect())
}
// Checks if the schemas of the tasks are the same
fn check_schema(&self) -> Result<()> {
let is_same_schema = self.data.tasks.windows(2).all(|pair| {
pair[0].alter_table.catalog_name == pair[1].alter_table.catalog_name
&& pair[0].alter_table.schema_name == pair[1].alter_table.schema_name
});
ensure!(
is_same_schema,
AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "Schemas of the tasks are not the same"
}
);
Ok(())
}
fn check_alter_kind(&self) -> Result<()> {
for task in &self.data.tasks {
let kind = task.alter_table.kind.as_ref().context(
AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "Alter kind is missing",
},
)?;
let Kind::AddColumns(_) = kind else {
return AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "Only support add columns operation",
}
.fail();
};
}
Ok(())
}
fn check_finished_task(task: &AlterTableTask, table: &TableInfoValue) -> bool {
let columns = table
.table_info
.meta
.schema
.column_schemas
.iter()
.map(|c| &c.name)
.collect::<HashSet<_>>();
let Some(kind) = task.alter_table.kind.as_ref() else {
return true; // Never get here since we have checked it in `check_alter_kind`
};
let Kind::AddColumns(add_columns) = kind else {
return true; // Never get here since we have checked it in `check_alter_kind`
};
// We only check that all columns have been finished. That is to say,
// if one part is finished but another part is not, it will be considered
// unfinished.
add_columns
.add_columns
.iter()
.map(|add_column| add_column.column_def.as_ref().map(|c| &c.name))
.all(|column| column.map(|c| columns.contains(c)).unwrap_or(false))
}
}

View File

@@ -0,0 +1,216 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use api::region::RegionResponse;
use api::v1::alter_table_expr::Kind;
use api::v1::region::{
alter_request, region_request, AddColumn, AddColumns, AlterRequest, AlterRequests,
RegionColumnDef, RegionRequest, RegionRequestHeader,
};
use api::v1::{self, AlterTableExpr};
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::{debug, warn};
use futures::future;
use store_api::metadata::ColumnMetadata;
use store_api::storage::{RegionId, RegionNumber, TableId};
use crate::ddl::utils::{add_peer_context_if_needed, raw_table_info};
use crate::error::Result;
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::{DeserializedValueWithBytes, RegionDistribution, TableMetadataManagerRef};
use crate::node_manager::NodeManagerRef;
use crate::rpc::router::{find_leaders, region_distribution, RegionRoute};
/// [AlterLogicalTablesExecutor] performs:
/// - Alters logical regions on the datanodes.
/// - Updates table metadata for the alter table operation.
pub struct AlterLogicalTablesExecutor<'a> {
/// The alter table expressions.
///
/// The first element is the logical table id, the second element is the alter table expression.
alters: Vec<(TableId, &'a AlterTableExpr)>,
}
impl<'a> AlterLogicalTablesExecutor<'a> {
pub fn new(alters: Vec<(TableId, &'a AlterTableExpr)>) -> Self {
Self { alters }
}
/// Alters logical regions on the datanodes.
pub(crate) async fn on_alter_regions(
&self,
node_manager: &NodeManagerRef,
region_routes: &[RegionRoute],
) -> Result<Vec<RegionResponse>> {
let region_distribution = region_distribution(region_routes);
let leaders = find_leaders(region_routes)
.into_iter()
.map(|p| (p.id, p))
.collect::<HashMap<_, _>>();
let mut alter_region_tasks = Vec::with_capacity(leaders.len());
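// Builds one batched request per leader datanode: every datanode hosting leader
// regions of this physical table receives a single RegionRequest that carries the
// alter requests for all of its leader regions.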
for (datanode_id, region_role_set) in region_distribution {
if region_role_set.leader_regions.is_empty() {
continue;
}
// Safety: the leader peer for this datanode id must exist in `leaders`.
let peer = leaders.get(&datanode_id).unwrap();
let requester = node_manager.datanode(peer).await;
let requests = self.make_alter_region_request(&region_role_set.leader_regions);
let requester = requester.clone();
let peer = peer.clone();
debug!("Sending alter region requests to datanode {}", peer);
alter_region_tasks.push(async move {
requester
.handle(make_request(requests))
.await
.map_err(add_peer_context_if_needed(peer))
});
}
future::join_all(alter_region_tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()
}
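/// Builds the [AlterRequests] batch for the given region numbers: one [AlterRequest]
/// per (logical table, region number) pair.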
fn make_alter_region_request(&self, region_numbers: &[RegionNumber]) -> AlterRequests {
let mut requests = Vec::with_capacity(region_numbers.len() * self.alters.len());
for (table_id, alter) in self.alters.iter() {
for region_number in region_numbers {
let region_id = RegionId::new(*table_id, *region_number);
let request = make_alter_region_request(region_id, alter);
requests.push(request);
}
}
AlterRequests { requests }
}
/// Updates table metadata for the alter table operation.
///
/// # Panics
/// - If the region distribution is not set when updating table metadata.
pub(crate) async fn on_alter_metadata(
physical_table_id: TableId,
table_metadata_manager: &TableMetadataManagerRef,
current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
region_distribution: RegionDistribution,
physical_columns: &[ColumnMetadata],
) -> Result<()> {
if physical_columns.is_empty() {
warn!("No physical columns found, leaving the physical table's schema unchanged when altering logical tables");
return Ok(());
}
let table_ref = current_table_info_value.table_ref();
let table_id = physical_table_id;
// Generates new table info
let old_raw_table_info = current_table_info_value.table_info.clone();
let new_raw_table_info =
raw_table_info::build_new_physical_table_info(old_raw_table_info, physical_columns);
debug!(
"Starting update table: {} metadata, table_id: {}, new table info: {:?}",
table_ref, table_id, new_raw_table_info
);
table_metadata_manager
.update_table_info(
current_table_info_value,
Some(region_distribution),
new_raw_table_info,
)
.await?;
Ok(())
}
/// Builds the cache ident keys for the alter logical tables.
///
/// The cache ident keys are:
/// - The table id of the logical tables.
/// - The table name of the logical tables.
/// - The table id of the physical table.
/// - The table name of the physical table.
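///
/// For illustration (hypothetical ids), two logical tables `l1`/`l2` under a physical
/// table `p` yield: `TableId(l1)`, `TableName(l1)`, `TableId(l2)`, `TableName(l2)`,
/// `TableId(p)`, `TableName(p)`.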
pub(crate) fn build_cache_ident_keys(
physical_table_info: &TableInfoValue,
logical_table_info_values: &[&TableInfoValue],
) -> Vec<CacheIdent> {
let mut cache_keys = Vec::with_capacity(logical_table_info_values.len() * 2 + 2);
cache_keys.extend(logical_table_info_values.iter().flat_map(|table| {
vec![
CacheIdent::TableId(table.table_info.ident.table_id),
CacheIdent::TableName(table.table_name()),
]
}));
cache_keys.push(CacheIdent::TableId(
physical_table_info.table_info.ident.table_id,
));
cache_keys.push(CacheIdent::TableName(physical_table_info.table_name()));
cache_keys
}
}
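/// Wraps a batch of [AlterRequests] into a single [RegionRequest], attaching the
/// current tracing context to the request header.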
fn make_request(alter_requests: AlterRequests) -> RegionRequest {
RegionRequest {
header: Some(RegionRequestHeader {
tracing_context: TracingContext::from_current_span().to_w3c(),
..Default::default()
}),
body: Some(region_request::Body::Alters(alter_requests)),
}
}
/// Makes an alter region request.
pub fn make_alter_region_request(
region_id: RegionId,
alter_table_expr: &AlterTableExpr,
) -> AlterRequest {
let region_id = region_id.as_u64();
let kind = match &alter_table_expr.kind {
Some(Kind::AddColumns(add_columns)) => Some(alter_request::Kind::AddColumns(
to_region_add_columns(add_columns),
)),
_ => unreachable!(), // Safety: the alter kind was validated beforehand (only AddColumns is supported)
};
AlterRequest {
region_id,
schema_version: 0,
kind,
}
}
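/// Converts the table-level [v1::AddColumns] into region-level [AddColumns], carrying
/// over only the column definitions.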
fn to_region_add_columns(add_columns: &v1::AddColumns) -> AddColumns {
let add_columns = add_columns
.add_columns
.iter()
.map(|add_column| {
let region_column_def = RegionColumnDef {
column_def: add_column.column_def.clone(),
..Default::default() // other fields are not used in alter logical table
};
AddColumn {
column_def: Some(region_column_def),
..Default::default() // other fields are not used in alter logical table
}
})
.collect();
AddColumns { add_columns }
}

View File

@@ -1,158 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_catalog::format_full_table_name;
use snafu::OptionExt;
use table::metadata::TableId;
use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
use crate::error::{
AlterLogicalTablesInvalidArgumentsSnafu, Result, TableInfoNotFoundSnafu, TableNotFoundSnafu,
TableRouteNotFoundSnafu,
};
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::key::DeserializedValueWithBytes;
use crate::rpc::ddl::AlterTableTask;
impl AlterLogicalTablesProcedure {
pub(crate) fn filter_task(&mut self, finished_tasks: &[bool]) -> Result<()> {
debug_assert_eq!(finished_tasks.len(), self.data.tasks.len());
debug_assert_eq!(finished_tasks.len(), self.data.table_info_values.len());
self.data.tasks = self
.data
.tasks
.drain(..)
.zip(finished_tasks.iter())
.filter_map(|(task, finished)| if *finished { None } else { Some(task) })
.collect();
self.data.table_info_values = self
.data
.table_info_values
.drain(..)
.zip(finished_tasks.iter())
.filter_map(|(table_info_value, finished)| {
if *finished {
None
} else {
Some(table_info_value)
}
})
.collect();
Ok(())
}
pub(crate) async fn fill_physical_table_info(&mut self) -> Result<()> {
let (physical_table_info, physical_table_route) = self
.context
.table_metadata_manager
.get_full_table_info(self.data.physical_table_id)
.await?;
let physical_table_info = physical_table_info.with_context(|| TableInfoNotFoundSnafu {
table: format!("table id - {}", self.data.physical_table_id),
})?;
let physical_table_route = physical_table_route
.context(TableRouteNotFoundSnafu {
table_id: self.data.physical_table_id,
})?
.into_inner();
self.data.physical_table_info = Some(physical_table_info);
let TableRouteValue::Physical(physical_table_route) = physical_table_route else {
return AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: format!(
"expected a physical table but got a logical table: {:?}",
self.data.physical_table_id
),
}
.fail();
};
self.data.physical_table_route = Some(physical_table_route);
Ok(())
}
pub(crate) async fn fill_table_info_values(&mut self) -> Result<()> {
let table_ids = self.get_all_table_ids().await?;
let table_info_values = self.get_all_table_info_values(&table_ids).await?;
debug_assert_eq!(table_info_values.len(), self.data.tasks.len());
self.data.table_info_values = table_info_values;
Ok(())
}
async fn get_all_table_info_values(
&self,
table_ids: &[TableId],
) -> Result<Vec<DeserializedValueWithBytes<TableInfoValue>>> {
let table_info_manager = self.context.table_metadata_manager.table_info_manager();
let mut table_info_map = table_info_manager.batch_get_raw(table_ids).await?;
let mut table_info_values = Vec::with_capacity(table_ids.len());
for (table_id, task) in table_ids.iter().zip(self.data.tasks.iter()) {
let table_info_value =
table_info_map
.remove(table_id)
.with_context(|| TableInfoNotFoundSnafu {
table: extract_table_name(task),
})?;
table_info_values.push(table_info_value);
}
Ok(table_info_values)
}
async fn get_all_table_ids(&self) -> Result<Vec<TableId>> {
let table_name_manager = self.context.table_metadata_manager.table_name_manager();
let table_name_keys = self
.data
.tasks
.iter()
.map(|task| extract_table_name_key(task))
.collect();
let table_name_values = table_name_manager.batch_get(table_name_keys).await?;
let mut table_ids = Vec::with_capacity(table_name_values.len());
for (value, task) in table_name_values.into_iter().zip(self.data.tasks.iter()) {
let table_id = value
.with_context(|| TableNotFoundSnafu {
table_name: extract_table_name(task),
})?
.table_id();
table_ids.push(table_id);
}
Ok(table_ids)
}
}
#[inline]
fn extract_table_name(task: &AlterTableTask) -> String {
format_full_table_name(
&task.alter_table.catalog_name,
&task.alter_table.schema_name,
&task.alter_table.table_name,
)
}
#[inline]
fn extract_table_name_key(task: &AlterTableTask) -> TableNameKey {
TableNameKey::new(
&task.alter_table.catalog_name,
&task.alter_table.schema_name,
&task.alter_table.table_name,
)
}

View File

@@ -1,113 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::alter_table_expr::Kind;
use api::v1::region::{
alter_request, region_request, AddColumn, AddColumns, AlterRequest, AlterRequests,
RegionColumnDef, RegionRequest, RegionRequestHeader,
};
use api::v1::{self, AlterTableExpr};
use common_telemetry::tracing_context::TracingContext;
use store_api::storage::RegionId;
use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
use crate::error::Result;
use crate::peer::Peer;
use crate::rpc::router::{find_leader_regions, RegionRoute};
impl AlterLogicalTablesProcedure {
pub(crate) fn make_request(
&self,
peer: &Peer,
region_routes: &[RegionRoute],
) -> Result<RegionRequest> {
let alter_requests = self.make_alter_region_requests(peer, region_routes)?;
let request = RegionRequest {
header: Some(RegionRequestHeader {
tracing_context: TracingContext::from_current_span().to_w3c(),
..Default::default()
}),
body: Some(region_request::Body::Alters(alter_requests)),
};
Ok(request)
}
fn make_alter_region_requests(
&self,
peer: &Peer,
region_routes: &[RegionRoute],
) -> Result<AlterRequests> {
let tasks = &self.data.tasks;
let regions_on_this_peer = find_leader_regions(region_routes, peer);
let mut requests = Vec::with_capacity(tasks.len() * regions_on_this_peer.len());
for (task, table) in self
.data
.tasks
.iter()
.zip(self.data.table_info_values.iter())
{
for region_number in &regions_on_this_peer {
let region_id = RegionId::new(table.table_info.ident.table_id, *region_number);
let request = make_alter_region_request(
region_id,
&task.alter_table,
table.table_info.ident.version,
);
requests.push(request);
}
}
Ok(AlterRequests { requests })
}
}
/// Makes an alter region request.
pub fn make_alter_region_request(
region_id: RegionId,
alter_table_expr: &AlterTableExpr,
schema_version: u64,
) -> AlterRequest {
let region_id = region_id.as_u64();
let kind = match &alter_table_expr.kind {
Some(Kind::AddColumns(add_columns)) => Some(alter_request::Kind::AddColumns(
to_region_add_columns(add_columns),
)),
_ => unreachable!(), // Safety: we have checked the kind in check_input_tasks
};
AlterRequest {
region_id,
schema_version,
kind,
}
}
fn to_region_add_columns(add_columns: &v1::AddColumns) -> AddColumns {
let add_columns = add_columns
.add_columns
.iter()
.map(|add_column| {
let region_column_def = RegionColumnDef {
column_def: add_column.column_def.clone(),
..Default::default() // other fields are not used in alter logical table
};
AddColumn {
column_def: Some(region_column_def),
..Default::default() // other fields are not used in alter logical table
}
})
.collect();
AddColumns { add_columns }
}

View File

@@ -1,50 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use table::metadata::RawTableInfo;
use table::table_name::TableName;
use crate::ddl::alter_logical_tables::AlterTablesData;
use crate::instruction::CacheIdent;
impl AlterTablesData {
pub(crate) fn build_cache_keys_to_invalidate(&mut self) {
let mut cache_keys = self
.table_info_values
.iter()
.flat_map(|table| {
vec![
CacheIdent::TableId(table.table_info.ident.table_id),
CacheIdent::TableName(extract_table_name(&table.table_info)),
]
})
.collect::<Vec<_>>();
cache_keys.push(CacheIdent::TableId(self.physical_table_id));
// Safety: physical_table_info already filled in previous steps
let physical_table_info = &self.physical_table_info.as_ref().unwrap().table_info;
cache_keys.push(CacheIdent::TableName(extract_table_name(
physical_table_info,
)));
self.table_cache_keys_to_invalidate = cache_keys;
}
}
fn extract_table_name(table_info: &RawTableInfo) -> TableName {
TableName::new(
&table_info.catalog_name,
&table_info.schema_name,
&table_info.name,
)
}

View File

@@ -13,66 +13,43 @@
// limitations under the License.
use common_grpc_expr::alter_expr_to_request;
use common_telemetry::warn;
use itertools::Itertools;
use snafu::ResultExt;
use table::metadata::{RawTableInfo, TableInfo};
use crate::ddl::alter_logical_tables::executor::AlterLogicalTablesExecutor;
use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
use crate::ddl::physical_table_metadata;
use crate::ddl::utils::table_info::batch_update_table_info_values;
use crate::error;
use crate::error::{ConvertAlterTableRequestSnafu, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::DeserializedValueWithBytes;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::region_distribution;
impl AlterLogicalTablesProcedure {
pub(crate) async fn update_physical_table_metadata(&mut self) -> Result<()> {
if self.data.physical_columns.is_empty() {
warn!("No physical columns found, leaving the physical table's schema unchanged when altering logical tables");
return Ok(());
}
// Safety: must exist.
let physical_table_info = self.data.physical_table_info.as_ref().unwrap();
let physical_table_route = self.data.physical_table_route.as_ref().unwrap();
let region_distribution = region_distribution(&physical_table_route.region_routes);
// Generates new table info
let old_raw_table_info = physical_table_info.table_info.clone();
let new_raw_table_info = physical_table_metadata::build_new_physical_table_info(
old_raw_table_info,
// Updates physical table's metadata.
AlterLogicalTablesExecutor::on_alter_metadata(
self.data.physical_table_id,
&self.context.table_metadata_manager,
physical_table_info,
region_distribution,
&self.data.physical_columns,
);
// Updates physical table's metadata, and we don't need to touch per-region settings.
self.context
.table_metadata_manager
.update_table_info(physical_table_info, None, new_raw_table_info)
.await?;
)
.await?;
Ok(())
}
pub(crate) async fn update_logical_tables_metadata(&mut self) -> Result<()> {
let table_info_values = self.build_update_metadata()?;
let manager = &self.context.table_metadata_manager;
let chunk_size = manager.batch_update_table_info_value_chunk_size();
if table_info_values.len() > chunk_size {
let chunks = table_info_values
.into_iter()
.chunks(chunk_size)
.into_iter()
.map(|check| check.collect::<Vec<_>>())
.collect::<Vec<_>>();
for chunk in chunks {
manager.batch_update_table_info_values(chunk).await?;
}
} else {
manager
.batch_update_table_info_values(table_info_values)
.await?;
}
Ok(())
batch_update_table_info_values(&self.context.table_metadata_manager, table_info_values)
.await
}
pub(crate) fn build_update_metadata(
@@ -100,9 +77,12 @@ impl AlterLogicalTablesProcedure {
let table_info = TableInfo::try_from(table.table_info.clone())
.context(error::ConvertRawTableInfoSnafu)?;
let table_ref = task.table_ref();
let request =
alter_expr_to_request(table.table_info.ident.table_id, task.alter_table.clone())
.context(ConvertAlterTableRequestSnafu)?;
let request = alter_expr_to_request(
table.table_info.ident.table_id,
task.alter_table.clone(),
Some(&table_info.meta),
)
.context(ConvertAlterTableRequestSnafu)?;
let new_meta = table_info
.meta
.builder_with_alter_kind(table_ref.table, &request.alter_kind)

View File

@@ -0,0 +1,279 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use api::v1::alter_table_expr::Kind;
use api::v1::AlterTableExpr;
use snafu::{ensure, OptionExt};
use store_api::storage::TableId;
use table::table_reference::TableReference;
use crate::ddl::utils::table_id::get_all_table_ids_by_names;
use crate::ddl::utils::table_info::{
all_logical_table_routes_have_same_physical_id, get_all_table_info_values_by_table_ids,
};
use crate::error::{
AlterLogicalTablesInvalidArgumentsSnafu, Result, TableInfoNotFoundSnafu,
TableRouteNotFoundSnafu,
};
use crate::key::table_info::TableInfoValue;
use crate::key::table_route::{PhysicalTableRouteValue, TableRouteManager, TableRouteValue};
use crate::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
/// [AlterLogicalTableValidator] validates the alter logical expressions.
pub struct AlterLogicalTableValidator<'a> {
physical_table_id: TableId,
alters: Vec<&'a AlterTableExpr>,
}
impl<'a> AlterLogicalTableValidator<'a> {
pub fn new(physical_table_id: TableId, alters: Vec<&'a AlterTableExpr>) -> Self {
Self {
physical_table_id,
alters,
}
}
/// Validates all alter table expressions have the same schema and catalog.
fn validate_schema(&self) -> Result<()> {
let is_same_schema = self.alters.windows(2).all(|pair| {
pair[0].catalog_name == pair[1].catalog_name
&& pair[0].schema_name == pair[1].schema_name
});
ensure!(
is_same_schema,
AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "Schemas of the alter table expressions are not the same"
}
);
Ok(())
}
/// Validates that all alter table expressions are of the supported kind.
/// Currently only supports `AddColumns` operations.
fn validate_alter_kind(&self) -> Result<()> {
for alter in &self.alters {
let kind = alter
.kind
.as_ref()
.context(AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "Alter kind is missing",
})?;
let Kind::AddColumns(_) = kind else {
return AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "Only support add columns operation",
}
.fail();
};
}
Ok(())
}
fn table_names(&self) -> Vec<TableReference> {
self.alters
.iter()
.map(|alter| {
TableReference::full(&alter.catalog_name, &alter.schema_name, &alter.table_name)
})
.collect()
}
/// Validates that the physical table info and route exist.
///
/// This method performs the following validations:
/// 1. Retrieves the full table info and route for the given physical table id
/// 2. Ensures both the table info and the table route exist
/// 3. Verifies that the table route is actually a physical table route, not a logical one
///
/// Returns a tuple containing the validated table info and physical table route.
async fn validate_physical_table(
&self,
table_metadata_manager: &TableMetadataManagerRef,
) -> Result<(
DeserializedValueWithBytes<TableInfoValue>,
PhysicalTableRouteValue,
)> {
let (table_info, table_route) = table_metadata_manager
.get_full_table_info(self.physical_table_id)
.await?;
let table_info = table_info.with_context(|| TableInfoNotFoundSnafu {
table: format!("table id - {}", self.physical_table_id),
})?;
let physical_table_route = table_route
.context(TableRouteNotFoundSnafu {
table_id: self.physical_table_id,
})?
.into_inner();
let TableRouteValue::Physical(table_route) = physical_table_route else {
return AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: format!(
"expected a physical table but got a logical table: {:?}",
self.physical_table_id
),
}
.fail();
};
Ok((table_info, table_route))
}
/// Validates that all logical table routes have the same physical table id.
///
/// This method performs the following validations:
/// 1. Retrieves table routes for all the given table ids.
/// 2. Ensures that all retrieved routes are logical table routes (not physical)
/// 3. Verifies that all logical table routes reference the same physical table id.
/// 4. Returns an error if any route is not logical or references a different physical table.
async fn validate_logical_table_routes(
&self,
table_route_manager: &TableRouteManager,
table_ids: &[TableId],
) -> Result<()> {
let all_logical_table_routes_have_same_physical_id =
all_logical_table_routes_have_same_physical_id(
table_route_manager,
table_ids,
self.physical_table_id,
)
.await?;
ensure!(
all_logical_table_routes_have_same_physical_id,
AlterLogicalTablesInvalidArgumentsSnafu {
err_msg: "All the tasks should have the same physical table id"
}
);
Ok(())
}
/// Validates the alter logical expressions.
///
/// This method performs the following validations:
/// 1. Validates that all alter table expressions have the same schema and catalog.
/// 2. Validates that all alter table expressions are of the supported kind.
/// 3. Validates that the physical table info and route exist.
/// 4. Validates that all logical table routes have the same physical table id.
///
/// Returns a [ValidatorResult] containing the validation results.
pub async fn validate(
&self,
table_metadata_manager: &TableMetadataManagerRef,
) -> Result<ValidatorResult> {
self.validate_schema()?;
self.validate_alter_kind()?;
let (physical_table_info, physical_table_route) =
self.validate_physical_table(table_metadata_manager).await?;
let table_names = self.table_names();
let table_ids =
get_all_table_ids_by_names(table_metadata_manager.table_name_manager(), &table_names)
.await?;
let mut table_info_values = get_all_table_info_values_by_table_ids(
table_metadata_manager.table_info_manager(),
&table_ids,
&table_names,
)
.await?;
self.validate_logical_table_routes(
table_metadata_manager.table_route_manager(),
&table_ids,
)
.await?;
let skip_alter = self
.alters
.iter()
.zip(table_info_values.iter())
.map(|(task, table)| skip_alter_logical_region(task, table))
.collect::<Vec<_>>();
retain_unskipped(&mut table_info_values, &skip_alter);
let num_skipped = skip_alter.iter().filter(|&&x| x).count();
Ok(ValidatorResult {
num_skipped,
skip_alter,
table_info_values,
physical_table_info,
physical_table_route,
})
}
}
/// The result of the validator.
pub(crate) struct ValidatorResult {
pub(crate) num_skipped: usize,
pub(crate) skip_alter: Vec<bool>,
pub(crate) table_info_values: Vec<DeserializedValueWithBytes<TableInfoValue>>,
pub(crate) physical_table_info: DeserializedValueWithBytes<TableInfoValue>,
pub(crate) physical_table_route: PhysicalTableRouteValue,
}
/// Retains the elements that are not skipped.
pub(crate) fn retain_unskipped<T>(target: &mut Vec<T>, skipped: &[bool]) {
debug_assert_eq!(target.len(), skipped.len());
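// `Vec::retain` visits elements in order, so stepping through `skipped` in lockstep
// keeps exactly the elements whose skip flag is false.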
let mut iter = skipped.iter();
target.retain(|_| !iter.next().unwrap());
}
/// Returns true if the logical region does not need to be altered.
fn skip_alter_logical_region(alter: &AlterTableExpr, table: &TableInfoValue) -> bool {
let existing_columns = table
.table_info
.meta
.schema
.column_schemas
.iter()
.map(|c| &c.name)
.collect::<HashSet<_>>();
let Some(kind) = alter.kind.as_ref() else {
return true; // Never get here since we have checked it in `validate_alter_kind`
};
let Kind::AddColumns(add_columns) = kind else {
return true; // Never get here since we have checked it in `validate_alter_kind`
};
// The alter is skipped only when every requested column already exists in the table.
// If some of the requested columns exist but others do not, the alter is still
// considered unfinished.
add_columns
.add_columns
.iter()
.map(|add_column| add_column.column_def.as_ref().map(|c| &c.name))
.all(|column| {
column
.map(|c| existing_columns.contains(c))
.unwrap_or(false)
})
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_retain_unskipped() {
let mut target = vec![1, 2, 3, 4, 5];
let skipped = vec![false, true, false, true, false];
retain_unskipped(&mut target, &skipped);
assert_eq!(target, vec![1, 3, 5]);
}
}

View File
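A minimal standalone sketch (values are hypothetical, and it assumes it sits next to the `retain_unskipped` helper from the validator above) of how the skip flags returned by the validator drive the bookkeeping in `on_prepare`:

#[cfg(test)]
mod skip_flags_sketch {
    use super::retain_unskipped;

    #[test]
    fn skip_flags_drive_retention() {
        // Three alter tasks; the validator decided the last two are already applied.
        let skip_alter = vec![false, true, true];
        let num_skipped = skip_alter.iter().filter(|&&x| x).count();
        assert_eq!(num_skipped, 2);

        // The same flags drop the finished tasks (and, likewise, their table info values).
        let mut tasks = vec!["add_col_a", "add_col_b", "add_col_c"];
        retain_unskipped(&mut tasks, &skip_alter);
        assert_eq!(tasks, vec!["add_col_a"]);
    }
}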

@@ -12,10 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod check;
mod executor;
mod metadata;
mod region_request;
mod update_metadata;
use std::vec;
@@ -29,33 +28,29 @@ use common_procedure::{
Context as ProcedureContext, ContextProvider, Error as ProcedureError, LockKey, PoisonKey,
PoisonKeys, Procedure, ProcedureId, Status, StringKey,
};
use common_telemetry::{debug, error, info, warn};
use futures::future::{self};
use common_telemetry::{error, info, warn};
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use store_api::metadata::ColumnMetadata;
use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY;
use store_api::storage::RegionId;
use strum::AsRefStr;
use table::metadata::{RawTableInfo, TableId, TableInfo};
use table::table_reference::TableReference;
use crate::cache_invalidator::Context;
use crate::ddl::physical_table_metadata::update_table_info_column_ids;
use crate::ddl::alter_table::executor::AlterTableExecutor;
use crate::ddl::utils::{
add_peer_context_if_needed, extract_column_metadatas, handle_multiple_results,
map_to_procedure_error, sync_follower_regions, MultipleResults,
extract_column_metadatas, handle_multiple_results, map_to_procedure_error,
sync_follower_regions, MultipleResults,
};
use crate::ddl::DdlContext;
use crate::error::{AbortProcedureSnafu, NoLeaderSnafu, PutPoisonSnafu, Result, RetryLaterSnafu};
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::{DeserializedValueWithBytes, RegionDistribution};
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::metrics;
use crate::poison_key::table_poison_key;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders, region_distribution, RegionRoute};
use crate::rpc::router::{find_leaders, region_distribution, RegionRoute};
/// The alter table procedure
pub struct AlterTableProcedure {
@@ -67,6 +62,24 @@ pub struct AlterTableProcedure {
/// If the procedure is recovered from JSON, the table info value is not cached,
/// but it was already validated in the prepare step.
new_table_info: Option<TableInfo>,
/// The alter table executor.
executor: AlterTableExecutor,
}
/// Builds the executor from the [`AlterTableData`].
///
/// # Panics
/// - If the alter kind is not set.
fn build_executor_from_alter_expr(alter_data: &AlterTableData) -> AlterTableExecutor {
let table_name = alter_data.table_ref().into();
let table_id = alter_data.table_id;
let alter_kind = alter_data.task.alter_table.kind.as_ref().unwrap();
let new_table_name = if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind {
Some(new_table_name.to_string())
} else {
None
};
AlterTableExecutor::new(table_name, table_id, new_table_name)
}
impl AlterTableProcedure {
@@ -74,33 +87,42 @@ impl AlterTableProcedure {
pub fn new(table_id: TableId, task: AlterTableTask, context: DdlContext) -> Result<Self> {
task.validate()?;
let data = AlterTableData::new(task, table_id);
let executor = build_executor_from_alter_expr(&data);
Ok(Self {
context,
data: AlterTableData::new(task, table_id),
data,
new_table_info: None,
executor,
})
}
pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
let data: AlterTableData = serde_json::from_str(json).context(FromJsonSnafu)?;
let executor = build_executor_from_alter_expr(&data);
Ok(AlterTableProcedure {
context,
data,
new_table_info: None,
executor,
})
}
// Checks whether the table exists.
pub(crate) async fn on_prepare(&mut self) -> Result<Status> {
self.check_alter().await?;
self.executor
.on_prepare(&self.context.table_metadata_manager)
.await?;
self.fill_table_info().await?;
// Validates the request and builds the new table info.
// We need to build the new table info here to ensure the alteration is still valid
// in the `UpdateMeta` state, since the region will already have been altered by then.
// Safety: `fill_table_info()` already set it.
// Safety: filled in `fill_table_info`.
let table_info_value = self.data.table_info_value.as_ref().unwrap();
self.new_table_info = Some(self.build_new_table_info(&table_info_value.table_info)?);
let new_table_info = AlterTableExecutor::validate_alter_table_expr(
&table_info_value.table_info,
self.data.task.alter_table.clone(),
)?;
self.new_table_info = Some(new_table_info);
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
@@ -143,9 +165,7 @@ impl AlterTableProcedure {
self.data.region_distribution =
Some(region_distribution(&physical_table_route.region_routes));
let leaders = find_leaders(&physical_table_route.region_routes);
let mut alter_region_tasks = Vec::with_capacity(leaders.len());
let alter_kind = self.make_region_alter_kind()?;
info!(
@@ -158,31 +178,14 @@ impl AlterTableProcedure {
ensure!(!leaders.is_empty(), NoLeaderSnafu { table_id });
// Puts the poison before submitting alter region requests to datanodes.
self.put_poison(ctx_provider, procedure_id).await?;
for datanode in leaders {
let requester = self.context.node_manager.datanode(&datanode).await;
let regions = find_leader_regions(&physical_table_route.region_routes, &datanode);
for region in regions {
let region_id = RegionId::new(table_id, region);
let request = self.make_alter_region_request(region_id, alter_kind.clone())?;
debug!("Submitting {request:?} to {datanode}");
let datanode = datanode.clone();
let requester = requester.clone();
alter_region_tasks.push(async move {
requester
.handle(request)
.await
.map_err(add_peer_context_if_needed(datanode))
});
}
}
let results = future::join_all(alter_region_tasks)
.await
.into_iter()
.collect::<Vec<_>>();
let results = self
.executor
.on_alter_regions(
&self.context.node_manager,
&physical_table_route.region_routes,
alter_kind,
)
.await;
match handle_multiple_results(results) {
MultipleResults::PartialRetryable(error) => {
@@ -224,7 +227,6 @@ impl AlterTableProcedure {
}
fn handle_alter_region_response(&mut self, mut results: Vec<RegionResponse>) -> Result<()> {
self.data.state = AlterTableState::UpdateMetadata;
if let Some(column_metadatas) =
extract_column_metadatas(&mut results, TABLE_COLUMN_METADATA_EXTENSION_KEY)?
{
@@ -232,7 +234,7 @@ impl AlterTableProcedure {
} else {
warn!("altering table result doesn't contains extension key `{TABLE_COLUMN_METADATA_EXTENSION_KEY}`,leaving the table's column metadata unchanged");
}
self.data.state = AlterTableState::UpdateMetadata;
Ok(())
}
@@ -260,43 +262,34 @@ impl AlterTableProcedure {
pub(crate) async fn on_update_metadata(&mut self) -> Result<Status> {
let table_id = self.data.table_id();
let table_ref = self.data.table_ref();
// Safety: checked before.
// Safety: filled in `fill_table_info`.
let table_info_value = self.data.table_info_value.as_ref().unwrap();
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
// Gets the table info from the cache or builds it.
let new_info = match &self.new_table_info {
let new_info = match &self.new_table_info {
Some(cached) => cached.clone(),
None => self.build_new_table_info(&table_info_value.table_info)
None => AlterTableExecutor::validate_alter_table_expr(
&table_info_value.table_info,
self.data.task.alter_table.clone(),
)
.inspect_err(|e| {
// We already check the table info in the prepare step so this should not happen.
error!(e; "Unable to build info for table {} in update metadata step, table_id: {}", table_ref, table_id);
})?,
};
debug!(
"Starting update table: {} metadata, new table info {:?}",
table_ref.to_string(),
new_info
);
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind {
self.on_update_metadata_for_rename(new_table_name.to_string(), table_info_value)
.await?;
} else {
let mut raw_table_info = new_info.into();
if !self.data.column_metadatas.is_empty() {
update_table_info_column_ids(&mut raw_table_info, &self.data.column_metadatas);
}
// region distribution is set in submit_alter_region_requests
let region_distribution = self.data.region_distribution.as_ref().unwrap().clone();
self.on_update_metadata_for_alter(
raw_table_info,
region_distribution,
// Safety: region distribution is set in `submit_alter_region_requests`.
self.executor
.on_alter_metadata(
&self.context.table_metadata_manager,
table_info_value,
self.data.region_distribution.as_ref(),
new_info.into(),
&self.data.column_metadatas,
)
.await?;
}
info!("Updated table metadata for table {table_ref}, table_id: {table_id}, kind: {alter_kind:?}");
self.data.state = AlterTableState::InvalidateTableCache;
@@ -305,18 +298,9 @@ impl AlterTableProcedure {
/// Broadcasts the invalidating table cache instructions.
async fn on_broadcast(&mut self) -> Result<Status> {
let cache_invalidator = &self.context.cache_invalidator;
cache_invalidator
.invalidate(
&Context::default(),
&[
CacheIdent::TableId(self.data.table_id()),
CacheIdent::TableName(self.data.table_ref().into()),
],
)
self.executor
.invalidate_table_cache(&self.context.cache_invalidator)
.await?;
Ok(Status::done())
}

View File

@@ -1,62 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::alter_table_expr::Kind;
use api::v1::RenameTable;
use common_catalog::format_full_table_name;
use snafu::ensure;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::error::{self, Result};
use crate::key::table_name::TableNameKey;
impl AlterTableProcedure {
/// Checks:
/// - The new table name doesn't exist (rename).
/// - Table exists.
pub(crate) async fn check_alter(&self) -> Result<()> {
let alter_expr = &self.data.task.alter_table;
let catalog = &alter_expr.catalog_name;
let schema = &alter_expr.schema_name;
let table_name = &alter_expr.table_name;
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
let manager = &self.context.table_metadata_manager;
if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind {
let new_table_name_key = TableNameKey::new(catalog, schema, new_table_name);
let exists = manager
.table_name_manager()
.exists(new_table_name_key)
.await?;
ensure!(
!exists,
error::TableAlreadyExistsSnafu {
table_name: format_full_table_name(catalog, schema, new_table_name),
}
)
}
let table_name_key = TableNameKey::new(catalog, schema, table_name);
let exists = manager.table_name_manager().exists(table_name_key).await?;
ensure!(
exists,
error::TableNotFoundSnafu {
table_name: format_full_table_name(catalog, schema, &alter_expr.table_name),
}
);
Ok(())
}
}

Some files were not shown because too many files have changed in this diff.