Compare commits

...

72 Commits

Author SHA1 Message Date
WenyXu
07b2ea096b feat(standalone): support dumping/restoring metadata 2025-04-20 08:13:35 +00:00
WenyXu
d55d9addf2 feat: introduce MetadataSnapshotManager 2025-04-20 06:32:56 +00:00
Yuhan Wang
e817a65d75 feat: enable submitting wal prune procedure periodically (#5867)
* feat: enable submitting wal prune procedure periodically

* chore: fix and add options

* test: add unit test

* test: fix unit test

* test: enable active_wal_pruning in test

* test: update default config

* chore: update config name

* refactor: use semaphore to control the number of prune processes

* refactor: use split client for wal prune manager and topic creator

* chore: add configs

* chore: apply review comments

* fix: use tracker properly

* fix: use guard to track semaphore

* test: update unit tests

* chore: update config name

* chore: use prunable_entry_id

* refactor: use the semaphore to only limit the submission step

* chore: remove legacy sort

* chore: better configs

* fix: update config.md

* chore: respect fmt

* test: update unit tests

* chore: use interval_at

* fix: fix unit test

* test: fix unit test

* test: fix unit test

* chore: apply review comments

* docs: update config docs
2025-04-18 16:02:33 +00:00
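
The bullets above sketch the shape of this feature: a ticker drives periodic submission of WAL prune procedures, and a semaphore (held through a guard) caps how many submissions are in flight at once. Below is a minimal, hedged sketch of that pattern with tokio; the names `submit_prune_procedure` and the topic list are stand-ins, not the actual GreptimeDB types.

```rust
use std::sync::Arc;
use std::time::Duration;

use tokio::sync::Semaphore;
use tokio::time::{interval_at, Instant};

// Hypothetical stand-in for submitting one WAL prune procedure for a topic.
async fn submit_prune_procedure(topic: String) {
    println!("submitting wal prune procedure for {topic}");
}

#[tokio::main]
async fn main() {
    let interval = Duration::from_secs(60);
    // Cap concurrent submissions, mirroring the `auto_prune_parallelism` idea.
    let limiter = Arc::new(Semaphore::new(10));

    // `interval_at` lets the first tick fire one full period after startup.
    let mut ticker = interval_at(Instant::now() + interval, interval);

    loop {
        ticker.tick().await;
        for topic in ["greptimedb_wal_topic_0", "greptimedb_wal_topic_1"] {
            // The owned permit acts as a guard: it is released only when the
            // spawned task finishes, so at most N submissions run at a time.
            let permit = limiter.clone().acquire_owned().await.unwrap();
            tokio::spawn(async move {
                submit_prune_procedure(topic.to_string()).await;
                drop(permit);
            });
        }
    }
}
```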
Yuhan Wang
41814bb49f feat: introduce high_watermark for remote wal logstore (#5877)
* feat: introduce high_watermark_since_flush

* test: add unit test for high watermark

* refactor: submit a request instead

* fix: send reply before submit request

* fix: no need to update twice

* feat: update high watermark in background periodically

* test: update unit tests

* fix: update high watermark periodically

* test: update unit tests

* chore: apply review comments

* chore: rename

* chore: apply review comments

* chore: clean up

* chore: apply review comments
2025-04-18 12:10:47 +00:00
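
As a rough illustration of the "update high watermark in background periodically" bullets above: the log store can cache the latest known Kafka high watermark in an atomic that a background task refreshes, so read paths never block on a broker round trip. This is only a sketch; `fetch_latest_offset` is hypothetical.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;

// Hypothetical: ask the broker for the latest offset of a topic.
async fn fetch_latest_offset() -> u64 {
    42
}

#[tokio::main]
async fn main() {
    let high_watermark = Arc::new(AtomicU64::new(0));

    // A background task refreshes the watermark periodically.
    let background = high_watermark.clone();
    tokio::spawn(async move {
        let mut ticker = tokio::time::interval(Duration::from_secs(5));
        loop {
            ticker.tick().await;
            let latest = fetch_latest_offset().await;
            background.store(latest, Ordering::Relaxed);
        }
    });

    // Readers simply load the cached value.
    tokio::time::sleep(Duration::from_secs(6)).await;
    println!("high watermark = {}", high_watermark.load(Ordering::Relaxed));
}
```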
Weny Xu
1e394af583 feat: prevent migrating a leader region to a peer that already has a region follower (#5923)
* feat: prevent migrating a leader region to a peer that already has a region follower

* chore: refine err msg
2025-04-18 11:13:01 +00:00
discord9
a9065f5319 chore: rm dev opt level 3 (#5932)
remove accidentally added dev profile opt-level 3 for dependencies
2025-04-18 11:04:19 +00:00
Weny Xu
b8c6f1c8ed feat: sync region followers after altering regions (#5901)
* feat: close follower regions after dropping leader regions

* chore: upgrade greptime-proto

* feat: sync region followers after alter region operations

* test: add tests

* chore: apply suggestions from CR

* chore: apply suggestions from CR
2025-04-18 10:21:35 +00:00
Ruihang Xia
115e5a03a8 fix: anchor regex string to fully match in promql (#5920)
* fix: anchor regex string to fully match in promql

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix format

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update test result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update test result again

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-18 10:13:45 +00:00
Yingwen
a5c443f734 perf: keep compiled regex in SimpleFilterEvaluator to avoid re-compiling (#5919)
* feat: cache regex in evaluator

* chore: fix warnings

* chore: add reference

* refactor: address CR comments

* Add negative to state
* Don't create the evaluator if the regex is invalid

* test: add test for maybe_build_regex
2025-04-18 09:36:28 +00:00
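
The commit message above outlines the approach: compile the regex once when the filter evaluator is built, keep a "negative" flag in the state, and refuse to build the evaluator at all if the pattern is invalid. A minimal sketch with the `regex` crate, assuming nothing about the real `SimpleFilterEvaluator` internals:

```rust
use regex::Regex;

/// Sketch of a filter that caches its compiled regex instead of
/// re-compiling it for every value it evaluates.
struct RegexFilter {
    regex: Regex,
    // Whether the original predicate was negated (e.g. `!~`).
    negative: bool,
}

impl RegexFilter {
    /// Returns `None` when the pattern is invalid, mirroring
    /// "don't create the evaluator if the regex is invalid".
    fn maybe_new(pattern: &str, negative: bool) -> Option<Self> {
        let regex = Regex::new(pattern).ok()?;
        Some(Self { regex, negative })
    }

    fn evaluate(&self, value: &str) -> bool {
        self.regex.is_match(value) ^ self.negative
    }
}

fn main() {
    let filter = RegexFilter::maybe_new("^web-[0-9]+$", false).unwrap();
    assert!(filter.evaluate("web-42"));
    assert!(!filter.evaluate("db-1"));
    assert!(RegexFilter::maybe_new("([", false).is_none());
}
```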
discord9
5287b87925 docs: memory profile scripts (#5922)
* docs: memory profile scripts

* chore: typo

* chore: comment

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* chore: newline eof

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-04-18 09:28:05 +00:00
jeremyhi
4d38d8aa1e chore: add heartbeat metrics (#5929) 2025-04-18 07:22:12 +00:00
Weny Xu
cc1b297831 fix: avoid double schema projection in file format readers (#5918) 2025-04-18 03:36:35 +00:00
Ruihang Xia
e4556ce12b fix: label values potential panic (#5921)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-17 14:01:21 +00:00
yihong
0f252c4d24 fix: oom for sqlness test in container (#5917)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-17 07:34:13 +00:00
Lei, HUANG
c58217ccec fix: support duration to interval conversion in PostgreSQL protocol (#5913)
* fix/pg-timestamp-diff:
 ### Add Support for `Duration` Type in PostgreSQL Encoding

 - **Enhanced `encode_value` Functionality**: Updated `src/servers/src/postgres/types.rs` to support encoding of `Value::Duration` using `PgInterval`.
 - **Implemented `Duration` Conversion**: Added conversion logic from `Duration` to `PgInterval` in `src/servers/src/postgres/types/interval.rs`.
 - **Added Unit Tests**: Introduced tests for `Duration` to `PgInterval` conversion in `src/servers/src/postgres/types/interval.rs`.
 - **Updated SQL Test Cases**: Modified `tests/cases/standalone/common/types/timestamp/timestamp.sql` and `timestamp.result` to include tests for timestamp subtraction using PostgreSQL protocol.

* fix: overflow

* fix/pg-timestamp-diff:
 Update `timestamp.sql` to ensure newline consistency

 - Modified `timestamp.sql` to add a newline at the end of the file for consistency.

* fix/pg-timestamp-diff:
 ### Add Documentation for Month Approximation in Interval Calculation

 - **File Modified**: `src/servers/src/postgres/types/interval.rs`
 - **Key Change**: Added a comment explaining the approximation of one month as 30.44 days in the interval calculations.
2025-04-17 03:58:36 +00:00
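
A sketch of the conversion this commit describes: breaking an elapsed `Duration` into the PostgreSQL interval fields (months, days, microseconds), using the 30.44-days-per-month approximation called out in the commit message. The `PgInterval` struct here is a local stand-in, not the actual type in `src/servers/src/postgres/types/interval.rs`.

```rust
use std::time::Duration;

/// Local stand-in for the wire-format PostgreSQL interval.
#[derive(Debug, PartialEq)]
struct PgInterval {
    months: i32,
    days: i32,
    microseconds: i64,
}

const MICROS_PER_DAY: i64 = 24 * 60 * 60 * 1_000_000;
/// One month is approximated as 30.44 days, as noted in the commit.
const DAYS_PER_MONTH: f64 = 30.44;

fn duration_to_pg_interval(d: Duration) -> PgInterval {
    let total_micros = d.as_micros() as i64;
    let total_days = total_micros / MICROS_PER_DAY;
    let months = (total_days as f64 / DAYS_PER_MONTH) as i64;
    let days = total_days - (months as f64 * DAYS_PER_MONTH) as i64;
    PgInterval {
        months: months as i32,
        days: days as i32,
        microseconds: total_micros % MICROS_PER_DAY,
    }
}

fn main() {
    // Roughly 100 days: about 3 months plus the remaining days.
    let interval = duration_to_pg_interval(Duration::from_secs(100 * 24 * 60 * 60));
    println!("{interval:?}");
}
```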
LFC
d27b9fc3a1 feat: implement Arrow Flight "DoPut" in Frontend (#5836)
* feat: implement Arrow Flight "DoPut" in Frontend

* support auth for "do_put"

* set request_id in DoPut requests and responses

* set "db" in request header
2025-04-17 03:46:19 +00:00
shuiyisong
fdab5d198e feat: add json parse processor (#5910)
* feat: add json parse processor

* chore: support parse to arr
2025-04-16 10:37:07 +00:00
Lin Yihai
7274ceba30 feat: Add query pipeline http api (#5819)
* feat(pipeline): add query pipeline http api.

* chore(pipeline): rename get pipeline method

* refactor(pipeline): Also insert string pipeline into cache after inserting into table.

---------

Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com>
2025-04-16 10:17:20 +00:00
Weny Xu
55c9a0de42 chore: upgrade opendal to 0.52 (#5857)
* chore: upgrade opendal to 0.52

* chore: upgrade object_store_opendal to 0.50

* Update Cargo.toml

Co-authored-by: dennis zhuang <killme2008@gmail.com>

---------

Co-authored-by: dennis zhuang <killme2008@gmail.com>
2025-04-15 18:48:42 +00:00
Ruihang Xia
0fb9e1995e fix: preserve timestamp precision of irate (#5904)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-15 15:00:30 +00:00
Lei, HUANG
799c7cbfa9 feat(mito): bulk insert request handling on datanode (#5831)
* wip: implement basic request handling

* feat/bulk-insert:
 ### Add Error Handling and Enhance Bulk Insert Functionality

 - **Error Handling**: Introduced a new error variant `ConvertDataType` in `error.rs` to handle conversion failures from `ConcreteDataType` to `ColumnDataType`.
 - **Bulk Insert Enhancements**:
   - Updated `WorkerRequest::BulkInserts` in `request.rs` to include metadata and sender.
   - Implemented `handle_bulk_inserts` in `worker.rs` to process bulk insert requests with region metadata.
   - Added functions `region_metadata_to_column_schema` and `record_batch_to_rows` in `handle_bulk_insert.rs` for schema conversion and row processing.
 - **API Changes**: Modified `RegionBulkInsertsRequest` in `region_request.rs` to include `region_id`.

 Files affected: `error.rs`, `request.rs`, `worker.rs`, `handle_bulk_insert.rs`, `region_request.rs`.

* feat/bulk-insert:
 **Enhance Error Handling and Add Unit Tests**

 - Improved error handling in `record_batch_to_rows` function within `handle_bulk_insert.rs` by returning `Result` and handling errors with `context`.
 - Added unit tests for `region_metadata_to_column_schema` and `record_batch_to_rows` functions in `handle_bulk_insert.rs` to ensure correct functionality and error handling.

* chore: update proto version

* feat/bulk-insert:
 - **Refactor Error Handling**: Updated error handling in `error.rs` by modifying the `ConvertDataType` error handling.
 - **Improve Logging and Error Reporting**: Enhanced logging and error reporting in `worker.rs` by adding error messages for missing region metadata.
 - **Add New Error Type**: Introduced `DecodeArrowIpc` error in `metadata.rs` to handle Arrow IPC decoding failures.
 - **Handle Arrow IPC Decoding**: Updated `region_request.rs` to handle Arrow IPC decoding errors using the new `DecodeArrowIpc` error type.

* chore: update proto version

* feat/bulk-insert:
 Refactor `handle_bulk_insert.rs` to simplify row construction

 - Removed the mutable `current_row` vector and refactored `row_at` function to return a new vector directly.
 - Updated `record_batch_to_rows` to utilize the refactored `row_at` function for constructing rows.

* feat/bulk-insert:
 ### Commit Summary

 **Enhancements in Region Server Request Handling**

 - Updated `region_server.rs` to include `RegionRequest::BulkInserts(_)` in the `RegionChange::Ingest` category, improving the handling of bulk insert operations.
 - Refined the categorization of region requests to ensure accurate mapping to `RegionChange` actions.
2025-04-15 14:11:50 +00:00
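
The core of the bulk-insert handling above is converting an Arrow `RecordBatch` carried in the request into the row format the region worker already understands (`record_batch_to_rows`). Below is a self-contained sketch of that column-to-row pivot with the `arrow` crates; the actual mito2 row types and error handling are omitted.

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema};

/// Pivot a columnar batch into per-row vectors of display strings.
/// The real code builds typed rows and returns errors with context.
fn record_batch_to_rows(batch: &RecordBatch) -> Vec<Vec<String>> {
    (0..batch.num_rows())
        .map(|row| {
            (0..batch.num_columns())
                .map(|col| {
                    let array = batch.column(col);
                    if let Some(a) = array.as_any().downcast_ref::<StringArray>() {
                        a.value(row).to_string()
                    } else if let Some(a) = array.as_any().downcast_ref::<Int64Array>() {
                        a.value(row).to_string()
                    } else {
                        // Unsupported type in this sketch.
                        "<unsupported>".to_string()
                    }
                })
                .collect()
        })
        .collect()
}

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new("host", DataType::Utf8, false),
        Field::new("value", DataType::Int64, false),
    ]));
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
            Arc::new(Int64Array::from(vec![1_i64, 2])) as ArrayRef,
        ],
    )
    .unwrap();

    assert_eq!(record_batch_to_rows(&batch)[1], vec!["b", "2"]);
}
```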
Ruihang Xia
dcf1a486f6 feat: support @@ (AtAt) operator for term matching (#5902)
* update dep and sqlness case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement transcribe rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* more tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-15 11:05:17 +00:00
Lei, HUANG
6700c0762d feat: Column-wise partition rule implementation (#5804)
* wip: naive impl

* feat/column-partition:
 ### Add support for DataFusion physical expressions

 - **`Cargo.lock` & `Cargo.toml`**: Added `datafusion-physical-expr` as a dependency to support physical expression creation.
 - **`expr.rs`**: Implemented conversion methods `try_as_logical_expr` and `try_as_physical_expr` for `Operand` and `PartitionExpr` to facilitate logical and physical expression handling.
 - **`multi_dim.rs`**: Enhanced `MultiDimPartitionRule` to utilize physical expressions for partitioning logic, including new methods for evaluating record batches.
 - **Tests**: Added unit tests for logical and physical expression conversions and partitioning logic in `expr.rs` and `multi_dim.rs`.

* feat/column-partition:
 ### Refactor and Enhance Partition Handling

 - **Refactor Partition Parsing Logic**: Moved partition parsing logic from `src/operator/src/statement/ddl.rs` to a new utility module `src/partition/src/utils.rs`. This includes functions like `parse_partitions`, `find_partition_bounds`, and `convert_one_expr`.
 - **Error Handling Improvements**: Added new error variants `ColumnNotFound`, `InvalidPartitionRule`, and `ParseSqlValue` in `src/partition/src/error.rs` to improve error reporting for partition-related operations.
 - **Dependency Updates**: Updated `Cargo.lock` and `Cargo.toml` to include new dependencies `common-time` and `session`.
 - **Code Cleanup**: Removed redundant partition parsing functions from `src/operator/src/error.rs` and `src/operator/src/statement/ddl.rs`.

* feat/column-partition:
 ## Refactor and Enhance SQL and Table Handling

 - **Refactor Column Definitions and Error Handling**
   - Made `FULLTEXT_GRPC_KEY`, `INVERTED_INDEX_GRPC_KEY`, and `SKIPPING_INDEX_GRPC_KEY` public in `column_def.rs`.
   - Removed `IllegalPrimaryKeysDef` error from `error.rs` and moved it to `sql/src/error.rs`.
   - Updated error handling in `fill_impure_default.rs` and `expr_helper.rs`.

 - **Enhance SQL Utility Functions**
   - Moved and refactored functions like `create_to_expr`, `find_primary_keys`, and `validate_create_expr` to `sql/src/util.rs`.
   - Added new utility functions for SQL parsing and validation in `sql/src/util.rs`.

 - **Improve Partition Handling**
   - Added `parse_partition_columns_and_exprs` function in `partition/src/utils.rs`.
   - Updated partition rule tests in `partition/src/multi_dim.rs` to use SQL-based partitioning.

 - **Simplify Table Name Handling**
   - Re-exported `table_idents_to_full_name` from `sql::util` in `session/src/table_name.rs`.

 - **Test Enhancements**
   - Updated tests in `partition/src/multi_dim.rs` to use SQL for partition rule creation.

* feat/column-partition:
 **Add Benchmarking and Enhance Partitioning Logic**

 - **Benchmarking**: Introduced a new benchmark for `split_record_batch` in `bench_split_record_batch.rs` using `criterion` and `rand` as development dependencies in `Cargo.toml`.
 - **Partitioning Logic**: Enhanced `MultiDimPartitionRule` in `multi_dim.rs` to include a default region for unmatched partition expressions and optimized the `split_record_batch` method.
 - **Refactoring**: Moved `sql_to_partition_rule` function to a public scope for reuse in `multi_dim.rs`.
 - **Testing**: Added new test module `test_split_record_batch` to validate the partitioning logic.

* Revert "feat/column-partition:  ### Refactor and Enhance Partition Handling"

This reverts commit 183fa19f

* fix: revert refactoring parse_partition

* revert some refactor

* feat/column-partition:
 ### Enhance Partitioning and Error Handling

 - **Benchmark Enhancements**: Added new benchmark `bench_split_record_batch_vs_row` in `bench_split_record_batch.rs` to compare row and column-based splitting.
 - **Error Handling Improvements**: Introduced new error variants in `error.rs` for better error reporting related to record batch evaluation and arrow kernel computation.
 - **Expression Handling**: Updated `expr.rs` to improve error context when converting schemas and creating physical expressions.
 - **Partition Rule Enhancements**: Made `row_at` and `record_batch_to_cols` methods public in `multi_dim.rs` and improved error handling for physical expression evaluation and boolean operations.

* feat/column-partition:
 ### Add `eq` Method and Optimize Expression Caching

 - **`expr.rs`**: Added a new `eq` method to the `Operand` struct for equality comparisons.
 - **`multi_dim.rs`**: Introduced a caching mechanism for physical expressions using `RwLock` to improve performance in `MultiDimPartitionRule`.
 - **`lib.rs`**: Enabled the `let_chains` feature for more concise code.
 - **`multi_dim.rs` Tests**: Enhanced test coverage with new test cases for multi-dimensional partitioning, including random record batch generation and default region handling.

* feat/column-partition:
 ### Add `split_record_batch` Method to `PartitionRule` Trait

 - **Files Modified**:
   - `src/partition/src/multi_dim.rs`
   - `src/partition/src/partition.rs`
   - `src/partition/src/splitter.rs`

 Added a new method `split_record_batch` to the `PartitionRule` trait, allowing record batches to be split into multiple regions based on partition values. Implemented this method in `MultiDimPartitionRule` and provided unimplemented stubs in test modules.

 ### Dependency Update

 - **File Modified**:
   - `src/operator/src/expr_helper.rs`

 Removed unused import `ColumnDataType` and `Timezone` from the test module.

 ### Miscellaneous

 - **File Modified**:
   - `src/partition/Cargo.toml`

 No functional changes; only minor formatting adjustments.

* chore: add license header

* chore: remove useless files

* feat/column-partition:
 Add support for handling unsupported partition expression values

 - **`error.rs`**: Introduced a new error variant `UnsupportedPartitionExprValue` to handle unsupported partition expression values, and updated `ErrorExt` to map this error to `StatusCode::InvalidArguments`.
 - **`expr.rs`**: Modified the `Operand` implementation to return the new error when encountering unsupported partition expression values.
 - **`multi_dim.rs`**: Added a fast path to optimize the selection process when all rows are selected.

* feat/column-partition: Add validation for expression and region length in MultiDimPartitionRule constructor

 • Ensure the lengths of exprs and regions match to prevent mismatches.
 • Introduce error handling for length discrepancies with a descriptive error message.

* chore: add debug log

* feat/column-partition: Removed the validation check for matching lengths between exprs and regions in MultiDimPartitionRule constructor, simplifying the initialization process.

* fix: unit tests
2025-04-15 10:42:07 +00:00
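
At its core, the column-wise `split_record_batch` described above evaluates each partition expression against whole columns to get a boolean mask per region, then routes any rows that match no expression to a default region. A simplified, self-contained sketch of that routing logic over plain vectors (no DataFusion physical expressions):

```rust
/// Sketch of column-wise splitting: each partition expression yields a boolean
/// mask over all rows; unmatched rows fall into a default region.
fn split_rows(masks: &[Vec<bool>], num_rows: usize, default_region: usize) -> Vec<Vec<usize>> {
    // One bucket of row indices per partition expression, plus the default.
    let mut regions = vec![Vec::new(); masks.len() + 1];
    for row in 0..num_rows {
        // The first matching expression wins, mirroring ordered partition rules.
        match masks.iter().position(|mask| mask[row]) {
            Some(region) => regions[region].push(row),
            None => regions[default_region].push(row),
        }
    }
    regions
}

fn main() {
    // Two partition expressions evaluated over 4 rows, e.g. `host < "h2"` and
    // `host >= "h2" AND host < "h4"`; row 3 matches neither.
    let masks = vec![
        vec![true, false, false, false],
        vec![false, true, true, false],
    ];
    let regions = split_rows(&masks, 4, 2);
    assert_eq!(regions, vec![vec![0], vec![1, 2], vec![3]]);
}
```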
discord9
032df4c533 feat(flow): dual engine (#5881)
* feat: partial use batch mode(WIP)

* feat: add flow engine trait

* refactor: more trait method

* dual engine

* feat: dual engine

* refactor: flow map cache

* chore: per review

* chore: per review
2025-04-15 07:03:12 +00:00
zyy17
7b13376239 refactor: add partition_rules_for_uuid() (#5743)
* refactor: add partition_rules_for_uuid()

* refactor: support up to 65536 partitions for partition_rules_for_uuid()
2025-04-15 06:46:31 +00:00
Zhenchi
2189631efd feat: optimize matches_term with constant term pre-compilation (#5886)
* feat: precompile finder for `matches_term`

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix sqlness

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-04-15 06:45:56 +00:00
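
The optimization in this commit is to build the substring searcher for a constant term once, instead of on every call to `matches_term`. A sketch of the idea using `memchr::memmem::Finder`, with a deliberately simplified word-boundary check (the real term-matching rules in GreptimeDB may differ):

```rust
use memchr::memmem::Finder;

/// A term matcher that pre-compiles its searcher for a constant term.
struct TermFinder<'a> {
    term: &'a str,
    finder: Finder<'a>,
}

impl<'a> TermFinder<'a> {
    fn new(term: &'a str) -> Self {
        Self { term, finder: Finder::new(term) }
    }

    /// Simplified rule: the term must be delimited by non-alphanumeric bytes.
    fn matches(&self, text: &str) -> bool {
        let bytes = text.as_bytes();
        let mut start = 0;
        while let Some(pos) = self.finder.find(&bytes[start..]) {
            let begin = start + pos;
            let end = begin + self.term.len();
            let left_ok = begin == 0 || !bytes[begin - 1].is_ascii_alphanumeric();
            let right_ok = end == bytes.len() || !bytes[end].is_ascii_alphanumeric();
            if left_ok && right_ok {
                return true;
            }
            start = begin + 1;
        }
        false
    }
}

fn main() {
    let matcher = TermFinder::new("error");
    assert!(matcher.matches("disk error: io timeout"));
    assert!(!matcher.matches("terrorism"));
}
```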
Ruihang Xia
96fbce1797 feat: report per-region metrics on region server (#5893)
* feat: report per-region metrics on region server

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename Change to Ingest

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-15 06:45:00 +00:00
Zhenchi
8d485e9be0 feat: support altering fulltext backend (#5896)
* feat: add `greptime_index_type` to `information_schema.key_column_usage`

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: show create

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-04-15 06:36:06 +00:00
Lei, HUANG
6a50d71920 fix: memtable panic (#5894)
* fix: memtable panic

* fix: ci
2025-04-14 13:15:56 +00:00
Ruihang Xia
747b71bf74 feat: add query engine options (#5895)
* feat: add query engine options

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update example

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-14 13:12:37 +00:00
Weny Xu
c522893552 fix: ensure logical regions are synced during region sync (#5878)
* fix: ensure logical regions are synced during region sync

* chore: apply suggestions from CR

* chore: apply suggestions from CR
2025-04-14 12:37:31 +00:00
yihong
7ddd7a9888 fix: flaky test on windows (#5890)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-14 07:13:40 +00:00
Zhenchi
e3675494b4 feat: apply terms with fulltext bloom backend (#5884)
* feat: apply terms with fulltext bloom backend

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* perf: preload jieba

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* polish doc

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-04-14 07:08:59 +00:00
dennis zhuang
7cd6b0f04b docs: update readme (#5891)
* docs: update readme

* chore: format

* docs: shorten

* chore: title

* fix: blank

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>

---------

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>
2025-04-14 06:45:24 +00:00
Ning Sun
be837ddc24 test: add tests to ensure nested data structure for identity pipeline (#5888) 2025-04-14 03:13:46 +00:00
liyang
5b0c75c85f ci: not push latest image when schedule release (#5883)
* ci: delete the scheduled release

* do no push latest image when schedule release

* check ref type and name

* check not schedule
2025-04-14 01:22:40 +00:00
Weny Xu
5a36fa5e18 fix: always reject writes while downgrading region (#5842)
* fix: always reject writes while downgrading region

* chore: apply suggestions from CR
2025-04-11 06:42:41 +00:00
fys
84e2bc52c2 fix: gRPC connection pool leak (#5876)
* fix: gRPC connection pool leak

* use .config() instead of .inner.config

* cancel the bg task if it is running

* fix: cr

* add unit test for pool release

* Avoid potential data races
2025-04-11 05:54:28 +00:00
LFC
71255b3cbd refactor: avoid empty display in errors (#5858)
* refactor: avoid empty display in errors

* fix: resolve PR comments
2025-04-10 10:08:45 +00:00
Weny Xu
382eacdc13 fix: include follower peers in region distribution (#5844) 2025-04-10 09:19:32 +00:00
Lei, HUANG
74d8fd00a4 fix: remove metadata region options (#5852)
* fix/remove-metadata-region-options:
 ### Add `SKIP_WAL_KEY` Option to Metric Engine

 - **Enhancements**:
   - Introduced `SKIP_WAL_KEY` to the metric engine options in `create.rs` and `mito_engine_options.rs`.
   - Updated test cases in `create.rs` to include `skip_wal` option and ensure it is removed for metadata regions.

 - **Refactoring**:
   - Updated `requests.rs` to use `SKIP_WAL_KEY` from `store_api::mito_engine_options`.

 These changes enhance the metric engine by allowing the option to skip Write-Ahead Logging (WAL) and ensure consistent usage of option keys across modules.

* fix/remove-metadata-region-options: Add note for new options in mito_engine_options.rs

 • Introduce a comment to remind developers to check if new options should be removed in region_options_for_metadata_region within metric_engine::engine::create.

* empty
2025-04-10 08:07:04 +00:00
Zhenchi
dce5e35d7c feat: apply terms with fulltext tantivy backend (#5869)
* feat: apply terms with fulltext tantivy backend

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-04-10 07:32:15 +00:00
Weny Xu
54ef29f394 feat: add catalog_manager to ProcedureServiceHandler (#5873) 2025-04-10 06:55:46 +00:00
LFC
e052c65a58 chore: remove repl (#5860) 2025-04-10 06:30:29 +00:00
LFC
e23979df9f chore: un-allow clippy's "readonly_write_lock" (#5862) 2025-04-10 02:05:50 +00:00
LFC
4b82ec7409 chore: remove obsolete way of passing http configurations through env (#5864) 2025-04-09 11:57:57 +00:00
Ruihang Xia
08d0f31865 perf: avoid duplicate computation in promql (#5863)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-09 11:53:25 +00:00
Ruihang Xia
dda7496265 perf: faster range manipulate for promql (#5859)
* try 1

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* heuristically advance cursor

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* prevent underflow

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* some comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* postpone vec allocation

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-09 10:08:41 +00:00
discord9
df362be012 feat(flow): batching mode engine (#5807)
* feat: partial impl of rr task/state

* feat: recording rule engine

* chore: rm unused

* chore: per review partially

* test: gen create table

* chore: rm some unused

* test: merge time window

* refactor: rename to batching mode

* refactor: per review

* refactor(partially): per review

* refactor: split engine.rs into three files

* refactor: use plan not sql

* chore: per review

* chore: per review

* refactor: per review

* refactor: per review

* chore: more per review

* refactor: per review

* refactor(partial): per review

* refactor: per review

* chore: clone task cheaper&more comments

* chore: fmt

* chore: typo
2025-04-09 09:53:32 +00:00
Ning Sun
2ebe005e3c refactor: simplify tls key read code (#5856) 2025-04-09 08:50:43 +00:00
Weny Xu
746b4e2369 refactor: improve error code handling in status code conversion (#5851)
* refactor: improve error code handling in status code conversion

* chore: apply suggestions from CR

* fix: only handle client-side thrown errors

* feat: introduce `DeadlineExceeded`

* fix: exclude Code::Unknown from retry conditions
2025-04-09 07:58:23 +00:00
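
The last bullet ("exclude Code::Unknown from retry conditions") is about which gRPC status codes the client treats as retryable. Below is a hedged sketch of such a predicate over `tonic::Code`; the exact set GreptimeDB retries is defined in the source, not here.

```rust
use tonic::Code;

/// Sketch: treat only transient, transport/server-side failures as retryable.
/// `Code::Unknown` is excluded because it usually wraps an error thrown on the
/// client side rather than a transient server condition.
fn is_retryable(code: Code) -> bool {
    matches!(
        code,
        Code::Unavailable | Code::DeadlineExceeded | Code::ResourceExhausted
    )
}

fn main() {
    assert!(is_retryable(Code::Unavailable));
    assert!(!is_retryable(Code::Unknown));
}
```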
Zhenchi
6c66ec3ffc refactor: abstract index source from fulltext index applier (#5845)
* feat: add term as fulltext index request

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix fmt

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor: abstract index source from fulltext index applier

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-04-09 04:27:41 +00:00
Yingwen
95d0c650ec feat: pushdown select distinct in some cases (#5847)
* feat: pushdown select distinct

* test: add sqlness test

* test: fix analyzer test
2025-04-09 02:39:04 +00:00
LFC
311727939d chore: update datafusion family (#5814) 2025-04-09 02:20:55 +00:00
Weny Xu
7e3cad8a55 feat: make admin_fn macro usable outside common_function module (#5850)
* feat: make `admin_fn` macro usable outside common_function module

* chore: expose mod for testing
2025-04-08 14:25:37 +00:00
discord9
72625958bf refactor(flow): make start flownode clearer (#5848)
refactor: make start flownode clearer
2025-04-08 14:08:51 +00:00
Ruihang Xia
7ea04817bd feat: shorten possible wrong query range (#5849)
* feat: shorten possible wrong query range

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add unit test case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-08 13:51:50 +00:00
Ruihang Xia
c26e165887 refactor: check and fix super import (#5846)
* refactor: check and fix super import

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add to makefile

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change dir

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-08 11:48:52 +00:00
Zhenchi
7335293983 feat: add term as fulltext index request (#5843)
* feat: add term as fulltext index request

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix fmt

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-04-08 11:19:32 +00:00
Yingwen
609e228852 fix: get root cause of the procedure when converting to pb (#5841) 2025-04-08 08:14:47 +00:00
Ruihang Xia
c16bae32c4 perf: evolve promql execution engine (#5691)
* use the same sort option across every prom plan

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tweak plans

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* wip

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix merge compile

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Revert "wip"

This reverts commit db58884236.

* tweak merge scan

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* handle error

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* pass distribution rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* reverse sort order

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refine plans

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* more optimizations for plans

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* check logical table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* weird tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add comment

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add test for series_divide

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix scalar calculation

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: workaround join partition

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update proto

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-04-08 08:12:15 +00:00
zyy17
ee4fe9d273 refactor: improve performance for Jaeger APIs (#5838)
* refactor: improve jaeger '/api/services' performance by adding the trace services table

* chore: refine some logic

* chore: compatible v0

* test: add integration test

* chore: expand default limit from 100 to 2000

* test: fix integration test

* refactor: make trace service table configurable

* refactor: use as large a timestamp as possible (2100-01-01 00:00:00)

* refactor: use '<trace_table>_services' as trace services table name
2025-04-08 02:28:06 +00:00
Yuhan Wang
6e6e335a81 feat(remote-wal): send flush request when pruning remote wal (#5825)
* feat: update minimum entry id in kvbackend

* fix: persist before delete

* chore: apply comments

* feat: add flush region in wal prune procedure

* fix: cherry-pick error

* chore: fmt

* chore: drop rx to avoid blocking on the response

* chore: update comments

* chore: apply review comments

* test: fix unit test

* feat: add option not to flush region during wal prune

* test: fix unit test

* fix: delete at minimum replay entry id + 1

* fix: cas

* chore: add comments

* chore: apply review comments

* chore: apply review comments

* chore: fix error msg

* chore: apply review comments

* fix: idempotent cas

* refactor: use a one-way sender

* chore: better err msg

* chore: fix unit test

* chore: apply review comments

* chore: apply review comments

* chore: replace send oneway
2025-04-07 14:05:18 +00:00
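
Several of the bullets above ("drop rx to avoid blocking on the response", "use a one-way sender", "replace send oneway") converge on the same shape: the WAL prune procedure sends a flush request to the region and deliberately does not wait for a reply. A minimal tokio sketch of that fire-and-forget send, with a hypothetical `FlushRegion` message type:

```rust
use tokio::sync::mpsc;

/// Hypothetical message asking a datanode to flush one region before pruning.
#[derive(Debug)]
struct FlushRegion {
    region_id: u64,
}

/// One-way send: pruning never blocks on the flush outcome; if the receiver
/// is gone, the error is ignored and pruning continues.
async fn request_flush(tx: &mpsc::Sender<FlushRegion>, region_id: u64) {
    let _ = tx.send(FlushRegion { region_id }).await;
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<FlushRegion>(16);
    request_flush(&tx, 1).await;
    println!("received: {:?}", rx.recv().await);
}
```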
Weny Xu
981d51785b fix: throw errors instead of ignoring (#5792)
* fix: throw errors instead of ignoring

* fix: fix unit tests

* refactor: remove schema version check

* fix: fix clippy

* chore: remove unused error

* refactor: remove schema version check

* feat: handle multiple results

* feat: introduce consistency guard

* fix: release consistency guard on datanode operation completion

* test: add tests

* chore: remove schema version

* refactor: rename

* test: add more tests

* chore: print all error

* tests: query table after alteration

* log ignored request

* refine fuzz test

* chore: fix clippy and log mailbox message

* chore: close prepared statement after execution

* chore: add comment

* chore: remove log

* chore: rename to `ConsistencyPoison`

* chore: remove unused error

* fix: fix unit tests

* chore: apply suggestions from CR
2025-04-07 13:51:00 +00:00
Weny Xu
cf1eda28aa feat: add region_id to CountdownTaskHandlerExt (#5834) 2025-04-07 09:25:59 +00:00
zyy17
cf1440fc32 refactor: add time range for jaeger get operations API (#5791)
* refactor: add default time range for jaeger get operations API

* refactor: use desc order for timestamp column

* chore: modify http header name
2025-04-07 09:07:31 +00:00
Yingwen
21a209f7ba fix: skip replacing exprs of the DistinctOn node (#5823)
* fix: handle distinct on specially

* chore: update comment
2025-04-07 08:59:40 +00:00
Weny Xu
917510ffd0 feat: introduce poison mechanism for procedure (#5822)
* feat: introduce poison for procedure

* tests: add unit tests

* refactor: minor refactor

* fix: unit tests

* chore: fix unit tests

* chore: apply suggestions from CR

* chore: apply suggestions from CR

* chore: update comments

* chore: introduce `ProcedureStatus::Poisoned`

* chore: upgrade greptime-proto to `2be0f`

* chore: apply suggestions from CR
2025-04-07 08:25:13 +00:00
fys
7b48ef1e97 chore: remove patch.crates-io for rustls (#5832)
* chore: remove patch.crates-io for rustls

* enable default-rustls-ring feature for mysql_sync

* fix: build error

* add comment

* update comment
2025-04-07 07:51:50 +00:00
Weny Xu
ac0f9ab575 refactor: remove backoff config (#5808)
* refactor: remove backoff config

* chore: update config.md

* fix: correct backoff config

* chore: change deadline to 120s
2025-04-07 07:22:22 +00:00
Ning Sun
f2907bb009 refactor!: make pipeline a required parameter when ingesting trace (#5828)
* feat: make pipeline a required header for trace

* test: add test case without pipeline
2025-04-07 06:18:17 +00:00
Ryan Despain
1695919ee7 clear message for an awesome achievement (#5829)
Initially there was what I think was a typo (`s/archive/achieve`), but then I thought some clarification might be nice on this great achievement.
2025-04-07 02:37:19 +00:00
535 changed files with 19967 additions and 5175 deletions

View File

@@ -2,13 +2,13 @@ meta:
configData: |-
[runtime]
global_rt_size = 4
[wal]
provider = "kafka"
broker_endpoints = ["kafka.kafka-cluster.svc.cluster.local:9092"]
num_topics = 3
auto_prune_topic_records = true
[datanode]
[datanode.client]
timeout = "120s"

View File

@@ -25,7 +25,7 @@ function create_version() {
fi
# Reuse $NEXT_RELEASE_VERSION to identify whether it's a nightly build.
# It will be like 'nigtly-20230808-7d0d8dc6'.
# It will be like 'nightly-20230808-7d0d8dc6'.
if [ "$NEXT_RELEASE_VERSION" = nightly ]; then
echo "$NIGHTLY_RELEASE_PREFIX-$(date "+%Y%m%d")-$(git rev-parse --short HEAD)"
exit 0
@@ -60,9 +60,9 @@ function create_version() {
}
# You can run as following examples:
# GITHUB_EVENT_NAME=push NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nigtly GITHUB_REF_NAME=v0.3.0 ./create-version.sh
# GITHUB_EVENT_NAME=workflow_dispatch NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=nightly NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
# GITHUB_EVENT_NAME=workflow_dispatch COMMIT_SHA=f0e7216c4bb6acce9b29a21ec2d683be2e3f984a NEXT_RELEASE_VERSION=dev NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
# GITHUB_EVENT_NAME=push NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nightly GITHUB_REF_NAME=v0.3.0 ./create-version.sh
# GITHUB_EVENT_NAME=workflow_dispatch NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=nightly NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
# GITHUB_EVENT_NAME=workflow_dispatch COMMIT_SHA=f0e7216c4bb6acce9b29a21ec2d683be2e3f984a NEXT_RELEASE_VERSION=dev NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
create_version

View File

@@ -317,7 +317,7 @@ jobs:
image-registry-username: ${{ secrets.DOCKERHUB_USERNAME }}
image-registry-password: ${{ secrets.DOCKERHUB_TOKEN }}
version: ${{ needs.allocate-runners.outputs.version }}
push-latest-tag: true
push-latest-tag: ${{ github.ref_type == 'tag' && !contains(github.ref_name, 'nightly') && github.event_name != 'schedule' }}
- name: Set build image result
id: set-build-image-result
@@ -364,7 +364,7 @@ jobs:
dev-mode: false
upload-to-s3: true
update-version-info: true
push-latest-tag: true
push-latest-tag: ${{ github.ref_type == 'tag' && !contains(github.ref_name, 'nightly') && github.event_name != 'schedule' }}
publish-github-release:
name: Create GitHub release and upload artifacts

Cargo.lock (generated)

File diff suppressed because it is too large

View File

@@ -77,7 +77,6 @@ clippy.print_stdout = "warn"
clippy.print_stderr = "warn"
clippy.dbg_macro = "warn"
clippy.implicit_clone = "warn"
clippy.readonly_write_lock = "allow"
rust.unknown_lints = "deny"
rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
@@ -90,11 +89,11 @@ rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
# See for more detaiils: https://github.com/rust-lang/cargo/issues/11329
ahash = { version = "0.8", features = ["compile-time-rng"] }
aquamarine = "0.6"
arrow = { version = "53.0.0", features = ["prettyprint"] }
arrow-array = { version = "53.0.0", default-features = false, features = ["chrono-tz"] }
arrow-flight = "53.0"
arrow-ipc = { version = "53.0.0", default-features = false, features = ["lz4", "zstd"] }
arrow-schema = { version = "53.0", features = ["serde"] }
arrow = { version = "54.2", features = ["prettyprint"] }
arrow-array = { version = "54.2", default-features = false, features = ["chrono-tz"] }
arrow-flight = "54.2"
arrow-ipc = { version = "54.2", default-features = false, features = ["lz4", "zstd"] }
arrow-schema = { version = "54.2", features = ["serde"] }
async-stream = "0.3"
async-trait = "0.1"
# Remember to update axum-extra, axum-macros when updating axum
@@ -113,15 +112,15 @@ clap = { version = "4.4", features = ["derive"] }
config = "0.13.0"
crossbeam-utils = "0.8"
dashmap = "6.1"
datafusion = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-optimizer = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-sql = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion-substrait = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
datafusion = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-common = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-functions = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-optimizer = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-physical-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-physical-plan = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
deadpool = "0.12"
deadpool-postgres = "0.14"
derive_builder = "0.20"
@@ -130,7 +129,7 @@ etcd-client = "0.14"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "fb8e20ce29afd81835e3ea3c1164c8ce10de2c65" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "b6d9cffd43c4e6358805a798f17e03e232994b82" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -148,6 +147,7 @@ moka = "0.12"
nalgebra = "0.33"
notify = "8.0"
num_cpus = "1.16"
object_store_opendal = "0.50"
once_cell = "1.18"
opentelemetry-proto = { version = "0.27", features = [
"gen-tonic",
@@ -157,11 +157,11 @@ opentelemetry-proto = { version = "0.27", features = [
"logs",
] }
parking_lot = "0.12"
parquet = { version = "53.0.0", default-features = false, features = ["arrow", "async", "object_store"] }
parquet = { version = "54.2", default-features = false, features = ["arrow", "async", "object_store"] }
paste = "1.0"
pin-project = "1.0"
prometheus = { version = "0.13.3", features = ["process"] }
promql-parser = { version = "0.5", features = ["ser"] }
promql-parser = { version = "0.5.1", features = ["ser"] }
prost = "0.13"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.9"
@@ -181,7 +181,8 @@ rstest = "0.25"
rstest_reuse = "0.7"
rust_decimal = "1.33"
rustc-hash = "2.0"
rustls = { version = "0.23.20", default-features = false } # override by patch, see [patch.crates-io]
# It is worth noting that we should try to avoid using aws-lc-rs until it can be compiled on various platforms.
rustls = { version = "0.23.25", default-features = false }
serde = { version = "1.0", features = ["derive"] }
serde_json = { version = "1.0", features = ["float_roundtrip"] }
serde_with = "3"
@@ -190,23 +191,22 @@ simd-json = "0.15"
similar-asserts = "1.6.0"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.8"
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "e98e6b322426a9d397a71efef17075966223c089", features = [
"visitor",
"serde",
] } # branch = "v0.54.x"
sqlx = { version = "0.8", features = [
"runtime-tokio-rustls",
"mysql",
"postgres",
"chrono",
] }
sysinfo = "0.33"
# on branch v0.52.x
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "71dd86058d2af97b9925093d40c4e03360403170", features = [
"visitor",
"serde",
] } # on branch v0.44.x
strum = { version = "0.27", features = ["derive"] }
sysinfo = "0.33"
tempfile = "3"
tokio = { version = "1.40", features = ["full"] }
tokio-postgres = "0.7"
tokio-rustls = { version = "0.26.0", default-features = false } # override by patch, see [patch.crates-io]
tokio-rustls = { version = "0.26.2", default-features = false }
tokio-stream = "0.1"
tokio-util = { version = "0.7", features = ["io-util", "compat"] }
toml = "0.8.8"
@@ -282,15 +282,6 @@ store-api = { path = "src/store-api" }
substrait = { path = "src/common/substrait" }
table = { path = "src/table" }
[patch.crates-io]
# change all rustls dependencies to use our fork to default to `ring` to make it "just work"
hyper-rustls = { git = "https://github.com/GreptimeTeam/hyper-rustls", rev = "a951e03" } # version = "0.27.5" with ring patch
rustls = { git = "https://github.com/GreptimeTeam/rustls", rev = "34fd0c6" } # version = "0.23.20" with ring patch
tokio-rustls = { git = "https://github.com/GreptimeTeam/tokio-rustls", rev = "4604ca6" } # version = "0.26.0" with ring patch
# This is commented, since we are not using aws-lc-sys, if we need to use it, we need to uncomment this line or use a release after this commit, or it wouldn't compile with gcc < 8.1
# see https://github.com/aws/aws-lc-rs/pull/526
# aws-lc-sys = { git ="https://github.com/aws/aws-lc-rs", rev = "556558441e3494af4b156ae95ebc07ebc2fd38aa" }
[workspace.dependencies.meter-macros]
git = "https://github.com/GreptimeTeam/greptime-meter.git"
rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"

View File

@@ -32,6 +32,10 @@ ifneq ($(strip $(BUILD_JOBS)),)
NEXTEST_OPTS += --build-jobs=${BUILD_JOBS}
endif
ifneq ($(strip $(BUILD_JOBS)),)
SQLNESS_OPTS += --jobs ${BUILD_JOBS}
endif
ifneq ($(strip $(CARGO_PROFILE)),)
CARGO_BUILD_OPTS += --profile ${CARGO_PROFILE}
endif
@@ -193,6 +197,7 @@ fix-clippy: ## Fix clippy violations.
fmt-check: ## Check code format.
cargo fmt --all -- --check
python3 scripts/check-snafu.py
python3 scripts/check-super-imports.py
.PHONY: start-etcd
start-etcd: ## Start single node etcd for testing purpose.

View File

@@ -6,7 +6,7 @@
</picture>
</p>
<h2 align="center">Unified & Cost-Effective Observerability Database for Metrics, Logs, and Events</h2>
<h2 align="center">Unified & Cost-Effective Observability Database for Metrics, Logs, and Events</h2>
<div align="center">
<h3 align="center">
@@ -62,35 +62,35 @@
## Introduction
**GreptimeDB** is an open-source unified & cost-effective observerability database for **Metrics**, **Logs**, and **Events** (also **Traces** in plan). You can gain real-time insights from Edge to Cloud at Any Scale.
**GreptimeDB** is an open-source, cloud-native, unified & cost-effective observability database for **Metrics**, **Logs**, and **Traces**. You can gain real-time insights from Edge to Cloud at Any Scale.
## News
**[GreptimeDB archives 1 billion cold run #1 in JSONBench!](https://greptime.com/blogs/2025-03-18-jsonbench-greptimedb-performance)**
**[GreptimeDB tops JSONBench's billion-record cold run test!](https://greptime.com/blogs/2025-03-18-jsonbench-greptimedb-performance)**
## Why GreptimeDB
Our core developers have been building observerability data platforms for years. Based on our best practices, GreptimeDB was born to give you:
Our core developers have been building observability data platforms for years. Based on our best practices, GreptimeDB was born to give you:
* **Unified Processing of Metrics, Logs, and Events**
* **Unified Processing of Observability Data**
GreptimeDB unifies observerability data processing by treating all data - whether metrics, logs, or events - as timestamped events with context. Users can analyze this data using either [SQL](https://docs.greptime.com/user-guide/query-data/sql) or [PromQL](https://docs.greptime.com/user-guide/query-data/promql) and leverage stream processing ([Flow](https://docs.greptime.com/user-guide/flow-computation/overview)) to enable continuous aggregation. [Read more](https://docs.greptime.com/user-guide/concepts/data-model).
A unified database that treats metrics, logs, and traces as timestamped wide events with context, supporting [SQL](https://docs.greptime.com/user-guide/query-data/sql)/[PromQL](https://docs.greptime.com/user-guide/query-data/promql) queries and [stream processing](https://docs.greptime.com/user-guide/flow-computation/overview) to simplify complex data stacks.
* **High Performance and Cost-effective**
Written in Rust, combines a distributed query engine with [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index) (inverted, fulltext, skip data, and vector) and optimized columnar storage to deliver sub-second responses on petabyte-scale data and high-cost efficiency.
* **Cloud-native Distributed Database**
Built for [Kubernetes](https://docs.greptime.com/user-guide/deployments/deploy-on-kubernetes/greptimedb-operator-management). GreptimeDB achieves seamless scalability with its [cloud-native architecture](https://docs.greptime.com/user-guide/concepts/architecture) of separated compute and storage, built on object storage (AWS S3, Azure Blob Storage, etc.) while enabling cross-cloud deployment through a unified data access layer.
* **Performance and Cost-effective**
* **Developer-Friendly**
Written in pure Rust for superior performance and reliability. GreptimeDB features a distributed query engine with intelligent indexing to handle high cardinality data efficiently. Its optimized columnar storage achieves 50x cost efficiency on cloud object storage through advanced compression. [Benchmark reports](https://www.greptime.com/blogs/2024-09-09-report-summary).
Access standardized SQL/PromQL interfaces through built-in web dashboard, REST API, and MySQL/PostgreSQL protocols. Supports widely adopted data ingestion [protocols](https://docs.greptime.com/user-guide/protocols/overview) for seamless migration and integration.
* **Cloud-Edge Collaboration**
* **Flexible Deployment Options**
GreptimeDB seamlessly operates across cloud and edge (ARM/Android/Linux), providing consistent APIs and control plane for unified data management and efficient synchronization. [Learn how to run on Android](https://docs.greptime.com/user-guide/deployments/run-on-android/).
* **Multi-protocol Ingestion, SQL & PromQL Ready**
Widely adopted database protocols and APIs, including MySQL, PostgreSQL, InfluxDB, OpenTelemetry, Loki and Prometheus, etc. Effortless Adoption & Seamless Migration. [Supported Protocols Overview](https://docs.greptime.com/user-guide/protocols/overview).
Deploy GreptimeDB anywhere from ARM-based edge devices to cloud environments with unified APIs and bandwidth-efficient data synchronization. Query edge and cloud data seamlessly through identical APIs. [Learn how to run on Android](https://docs.greptime.com/user-guide/deployments/run-on-android/).
For more detailed info please read [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb).
@@ -233,3 +233,5 @@ Special thanks to all the contributors who have propelled GreptimeDB forward. Fo
- GreptimeDB's query engine is powered by [Apache Arrow DataFusion™](https://arrow.apache.org/datafusion/).
- [Apache OpenDAL™](https://opendal.apache.org) gives GreptimeDB a very general and elegant data access abstraction layer.
- GreptimeDB's meta service is based on [etcd](https://etcd.io/).
<img alt="Known Users" src="https://greptime.com/logo/img/users.png"/>

View File

@@ -85,10 +85,6 @@
| `wal.create_topic_timeout` | String | `30s` | Above which a topic creation operation will be cancelled.<br/>**It's only used when the provider is `kafka`**. |
| `wal.max_batch_bytes` | String | `1MB` | The max size of a single producer batch.<br/>Warning: Kafka has a default limit of 1MB per message in a topic.<br/>**It's only used when the provider is `kafka`**. |
| `wal.consumer_wait_timeout` | String | `100ms` | The consumer wait timeout.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_init` | String | `500ms` | The initial backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_max` | String | `10s` | The maximum backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_base` | Integer | `2` | The exponential backoff rate, i.e. next backoff = base * current backoff.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_deadline` | String | `5mins` | The deadline of retries.<br/>**It's only used when the provider is `kafka`**. |
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries during read WAL.<br/>**It's only used when the provider is `kafka`**.<br/><br/>This option ensures that when Kafka messages are deleted, the system<br/>can still successfully replay memtable data without throwing an<br/>out-of-range error.<br/>However, enabling this option might lead to unexpected data loss,<br/>as the system will skip over missing entries instead of treating<br/>them as critical errors. |
| `metadata_store` | -- | -- | Metadata storage options. |
| `metadata_store.file_size` | String | `64MB` | The size of the metadata store log file. |
@@ -100,6 +96,8 @@
| `procedure.max_running_procedures` | Integer | `128` | Max running procedures.<br/>The maximum number of procedures that can be running at the same time.<br/>If the number of running procedures exceeds this limit, the procedure will be rejected. |
| `flow` | -- | -- | flow engine options. |
| `flow.num_workers` | Integer | `0` | The number of flow worker in flownode.<br/>Not setting(or set to 0) this value will use the number of CPU cores divided by 2. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
@@ -274,6 +272,8 @@
| `meta_client.metadata_cache_max_capacity` | Integer | `100000` | The configuration about the cache of the metadata. |
| `meta_client.metadata_cache_ttl` | String | `10m` | TTL of the metadata cache. |
| `meta_client.metadata_cache_tti` | String | `5m` | -- |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.connect_timeout` | String | `10s` | -- |
@@ -343,15 +343,14 @@
| `wal.provider` | String | `raft_engine` | -- |
| `wal.broker_endpoints` | Array | -- | The broker endpoints of the Kafka cluster. |
| `wal.auto_create_topics` | Bool | `true` | Automatically create topics for WAL.<br/>Set to `true` to automatically create topics for WAL.<br/>Otherwise, use topics named `topic_name_prefix_[0..num_topics)` |
| `wal.auto_prune_interval` | String | `0s` | Interval of automatically WAL pruning.<br/>Set to `0s` to disable automatically WAL pruning which delete unused remote WAL entries periodically. |
| `wal.trigger_flush_threshold` | Integer | `0` | The threshold to trigger a flush operation of a region in automatically WAL pruning.<br/>Metasrv will send a flush request to flush the region when:<br/>`trigger_flush_threshold` + `prunable_entry_id` < `max_prunable_entry_id`<br/>where:<br/>- `prunable_entry_id` is the maximum entry id that can be pruned of the region.<br/>- `max_prunable_entry_id` is the maximum prunable entry id among all regions in the same topic.<br/>Set to `0` to disable the flush operation. |
| `wal.auto_prune_parallelism` | Integer | `10` | Concurrent task limit for automatically WAL pruning. |
| `wal.num_topics` | Integer | `64` | Number of topics. |
| `wal.selector_type` | String | `round_robin` | Topic selector type.<br/>Available selector types:<br/>- `round_robin` (default) |
| `wal.topic_name_prefix` | String | `greptimedb_wal_topic` | A Kafka topic is constructed by concatenating `topic_name_prefix` and `topic_id`.<br/>Only accepts strings that match the following regular expression pattern:<br/>[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*<br/>i.g., greptimedb_wal_topic_0, greptimedb_wal_topic_1. |
| `wal.replication_factor` | Integer | `1` | Expected number of replicas of each partition. |
| `wal.create_topic_timeout` | String | `30s` | Above which a topic creation operation will be cancelled. |
| `wal.backoff_init` | String | `500ms` | The initial backoff for kafka clients. |
| `wal.backoff_max` | String | `10s` | The maximum backoff for kafka clients. |
| `wal.backoff_base` | Integer | `2` | Exponential backoff rate, i.e. next backoff = base * current backoff. |
| `wal.backoff_deadline` | String | `5mins` | Stop reconnecting if the total wait time reaches the deadline. If this config is missing, the reconnecting won't terminate. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -434,13 +433,11 @@
| `wal.broker_endpoints` | Array | -- | The Kafka broker endpoints.<br/>**It's only used when the provider is `kafka`**. |
| `wal.max_batch_bytes` | String | `1MB` | The max size of a single producer batch.<br/>Warning: Kafka has a default limit of 1MB per message in a topic.<br/>**It's only used when the provider is `kafka`**. |
| `wal.consumer_wait_timeout` | String | `100ms` | The consumer wait timeout.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_init` | String | `500ms` | The initial backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_max` | String | `10s` | The maximum backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_base` | Integer | `2` | The exponential backoff rate, i.e. next backoff = base * current backoff.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_deadline` | String | `5mins` | The deadline of retries.<br/>**It's only used when the provider is `kafka`**. |
| `wal.create_index` | Bool | `true` | Whether to enable WAL index creation.<br/>**It's only used when the provider is `kafka`**. |
| `wal.dump_index_interval` | String | `60s` | The interval for dumping WAL indexes.<br/>**It's only used when the provider is `kafka`**. |
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries during read WAL.<br/>**It's only used when the provider is `kafka`**.<br/><br/>This option ensures that when Kafka messages are deleted, the system<br/>can still successfully replay memtable data without throwing an<br/>out-of-range error.<br/>However, enabling this option might lead to unexpected data loss,<br/>as the system will skip over missing entries instead of treating<br/>them as critical errors. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |

View File

@@ -166,22 +166,6 @@ max_batch_bytes = "1MB"
## **It's only used when the provider is `kafka`**.
consumer_wait_timeout = "100ms"
## The initial backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_init = "500ms"
## The maximum backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_max = "10s"
## The exponential backoff rate, i.e. next backoff = base * current backoff.
## **It's only used when the provider is `kafka`**.
backoff_base = 2
## The deadline of retries.
## **It's only used when the provider is `kafka`**.
backoff_deadline = "5mins"
## Whether to enable WAL index creation.
## **It's only used when the provider is `kafka`**.
create_index = true
@@ -259,6 +243,12 @@ overwrite_entry_start_id = false
# credential = "base64-credential"
# endpoint = "https://storage.googleapis.com"
## The query engine options.
[query]
## Parallelism of the query engine.
## Default to 0, which means the number of CPU cores.
parallelism = 0
## The data storage options.
[storage]
## The working home directory.

View File

@@ -179,6 +179,12 @@ metadata_cache_ttl = "10m"
# TTI of the metadata cache.
metadata_cache_tti = "5m"
## The query engine options.
[query]
## Parallelism of the query engine.
## Default to 0, which means the number of CPU cores.
parallelism = 0
## Datanode options.
[datanode]
## Datanode client options.

View File

@@ -130,6 +130,22 @@ broker_endpoints = ["127.0.0.1:9092"]
## Otherwise, use topics named `topic_name_prefix_[0..num_topics)`
auto_create_topics = true
## Interval of automatic WAL pruning.
## Set to `0s` to disable automatic WAL pruning, which periodically deletes unused remote WAL entries.
auto_prune_interval = "0s"
## The threshold to trigger a flush operation of a region during automatic WAL pruning.
## Metasrv will send a flush request to flush the region when:
## `trigger_flush_threshold` + `prunable_entry_id` < `max_prunable_entry_id`
## where:
## - `prunable_entry_id` is the maximum entry id of the region that can be pruned.
## - `max_prunable_entry_id` is the maximum prunable entry id among all regions in the same topic.
## Set to `0` to disable the flush operation.
trigger_flush_threshold = 0
## Concurrent task limit for automatic WAL pruning.
auto_prune_parallelism = 10
## Number of topics.
num_topics = 64
@@ -149,17 +165,6 @@ replication_factor = 1
## The timeout above which a topic creation operation will be cancelled.
create_topic_timeout = "30s"
## The initial backoff for kafka clients.
backoff_init = "500ms"
## The maximum backoff for kafka clients.
backoff_max = "10s"
## Exponential backoff rate, i.e. next backoff = base * current backoff.
backoff_base = 2
## Stop reconnecting if the total wait time reaches the deadline. If this config is missing, reconnection attempts won't terminate.
backoff_deadline = "5mins"
# The Kafka SASL configuration.
# **It's only used when the provider is `kafka`**.

View File

@@ -239,22 +239,6 @@ max_batch_bytes = "1MB"
## **It's only used when the provider is `kafka`**.
consumer_wait_timeout = "100ms"
## The initial backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_init = "500ms"
## The maximum backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_max = "10s"
## The exponential backoff rate, i.e. next backoff = base * current backoff.
## **It's only used when the provider is `kafka`**.
backoff_base = 2
## The deadline of retries.
## **It's only used when the provider is `kafka`**.
backoff_deadline = "5mins"
## Ignore missing entries when reading the WAL.
## **It's only used when the provider is `kafka`**.
##
@@ -350,6 +334,12 @@ max_running_procedures = 128
# credential = "base64-credential"
# endpoint = "https://storage.googleapis.com"
## The query engine options.
[query]
## Parallelism of the query engine.
## Default to 0, which means the number of CPU cores.
parallelism = 0
## The data storage options.
[storage]
## The working home directory.

View File

@@ -1,6 +1,6 @@
# Profile memory usage of GreptimeDB
This crate provides an easy approach to dump memory profiling info.
This crate provides an easy approach to dump memory profiling info. A set of ready-to-use scripts is provided in [docs/how-to/memory-profile-scripts](docs/how-to/memory-profile-scripts).
## Prerequisites
### jemalloc

View File

@@ -0,0 +1,52 @@
# Memory Analysis Process
This section will guide you through the process of analyzing memory usage for greptimedb.
1. Get the `jeprof` tool script; see the next section ("Getting the `jeprof` tool") for details.
2. After starting `greptimedb` (with env var `MALLOC_CONF=prof:true`), execute the `dump.sh` script with the PID of the `greptimedb` process as an argument. The script continuously monitors memory usage and captures a profile whenever usage grows beyond a threshold (e.g. +20MB within 10 minutes), writing `greptime-{timestamp}.gprof` files.
3. Once you have 2-3 gprof files, run `gen_flamegraph.sh` in the same environment to generate flame graphs showing memory allocation call stacks.
4. **NOTE:** The `gen_flamegraph.sh` script requires `jeprof` and optionally `flamegraph.pl` to be in the current directory. If you want to generate flame graphs right away, run the `get_flamegraph_tool.sh` script, which downloads the flame graph generation tool `flamegraph.pl` to the current directory.
The usage of `gen_flamegraph.sh` is:
`Usage: ./gen_flamegraph.sh <binary_path> <gprof_directory>`
where `<binary_path>` is the path to the greptimedb binary and `<gprof_directory>` is the directory containing the gprof files (the directory `dump.sh` is dumping profiles to).
Example call: `./gen_flamegraph.sh ./greptime .`
Generating the flame graphs might take a few minutes. The generated flame graphs are located in the `<gprof_directory>/flamegraphs` directory. If no `flamegraph.pl` is found, that directory will only contain `.collapse` files, which is also fine.
5. You can send the generated flame graphs (the entire `<gprof_directory>/flamegraphs` folder) to developers for further analysis. An end-to-end sketch of these steps is shown below.
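
A minimal end-to-end sketch of the steps above, assuming the scripts sit in the current directory next to the `greptime` binary; the standalone invocation, the `pgrep` lookup, and the paths are illustrative:

```bash
# 1. Start greptimedb with jemalloc profiling enabled (standalone mode shown as an example).
MALLOC_CONF=prof:true ./greptime standalone start &

# 2. Monitor it; dump.sh takes the PID as its only argument and writes
#    greptime-{timestamp}.gprof files into the current directory.
./dump.sh "$(pgrep -n greptime)" &

# 3. Once a few profiles exist, fetch flamegraph.pl (if needed) and build the flame graphs.
./get_flamegraph_tool.sh
./gen_flamegraph.sh ./greptime .
```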
## Getting the `jeprof` tool
There are three ways to get `jeprof`, listed here from simplest to most complex. Any one of them is fine, as long as it is done in the same environment that `greptimedb` will be running on:
1. If you are compiling greptimedb from source, then `jeprof` is already produced during compilation. After running `cargo build`, execute `find_compiled_jeprof.sh`. This will copy `jeprof` to the current directory.
2. Or, if you have the Rust toolchain installed locally, simply follow these commands:
```bash
cargo new get_jeprof
cd get_jeprof
```
Then add this dependency to `Cargo.toml`:
```toml
[dependencies]
tikv-jemalloc-ctl = { version = "0.6", features = ["use_std", "stats"] }
```
then run:
```bash
cargo build
```
After that, the `jeprof` tool has been produced. Now run `find_compiled_jeprof.sh` in the current directory; it will copy the `jeprof` tool to the current directory.
3. Compile jemalloc from source.
First clone this repo and check out this commit:
```bash
git clone https://github.com/tikv/jemalloc.git
cd jemalloc
git checkout e13ca993e8ccb9ba9847cc330696e02839f328f7
```
then run:
```bash
./configure
make
```
and `jeprof` will be in the `bin/` directory. Copy it to the current directory.

View File

@@ -0,0 +1,78 @@
#!/bin/bash
# Monitors greptime process memory usage every 10 minutes
# Triggers memory profile capture via `curl -X POST localhost:4000/debug/prof/mem > greptime-{timestamp}.gprof`
# when memory increases by more than 20MB since last check
# Generated profiles can be analyzed using flame graphs as described in `how-to-profile-memory.md`
# (jeprof is compiled with the database - see documentation)
# Alternative: Share binaries + profiles for analysis (Docker images preferred)
# Threshold in Kilobytes (20 MB)
threshold_kb=$((20 * 1024))
sleep_interval=$((10 * 60))
# Variable to store the last measured memory usage in KB
last_mem_kb=0
echo "Starting memory monitoring for 'greptime' process..."
while true; do
# Check if PID is provided as an argument
if [ -z "$1" ]; then
echo "$(date): PID must be provided as a command-line argument."
exit 1
fi
pid="$1"
# Validate that the PID is a number
if ! [[ "$pid" =~ ^[0-9]+$ ]]; then
echo "$(date): Invalid PID: '$pid'. PID must be a number."
exit 1
fi
# Get the current Resident Set Size (RSS) in Kilobytes (strip the column padding that ps adds)
current_mem_kb=$(ps -o rss= -p "$pid" | tr -d ' ')
# Check if ps command was successful and returned a number
if ! [[ "$current_mem_kb" =~ ^[0-9]+$ ]]; then
echo "$(date): Failed to get memory usage for PID $pid. Skipping check."
# Keep last_mem_kb to avoid false positives if the process briefly becomes unreadable.
sleep $sleep_interval
continue
fi
echo "$(date): Current memory usage for PID $pid: ${current_mem_kb} KB"
# Compare with the last measurement
# on the first run last_mem_kb is 0, so the comparison below also takes a baseline dump as long as usage exceeds the threshold
diff_kb=$((current_mem_kb - last_mem_kb))
echo "$(date): Memory usage change since last check: ${diff_kb} KB"
if [ "$diff_kb" -gt "$threshold_kb" ]; then
echo "$(date): Memory increase (${diff_kb} KB) exceeded threshold (${threshold_kb} KB). Dumping profile..."
timestamp=$(date +%Y%m%d%H%M%S)
profile_file="greptime-${timestamp}.gprof"
# Execute curl and capture output to file
if curl -sf -X POST localhost:4000/debug/prof/mem > "$profile_file"; then
echo "$(date): Memory profile saved to $profile_file"
else
echo "$(date): Failed to dump memory profile (curl exit code: $?)."
# Remove the potentially empty/failed profile file
rm -f "$profile_file"
fi
else
echo "$(date): Memory increase (${diff_kb} KB) is within the threshold (${threshold_kb} KB)."
fi
# Update the last memory usage
last_mem_kb=$current_mem_kb
# Wait for the next check interval
echo "$(date): Sleeping for $sleep_interval seconds..."
sleep $sleep_interval
done
echo "Memory monitoring script stopped." # This line might not be reached in normal operation

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Locates compiled jeprof binary (memory analysis tool) after cargo build
# Copies it to current directory from target/ build directories
JPROF_PATH=$(find . -name 'jeprof' -print -quit)
if [ -n "$JPROF_PATH" ]; then
echo "Found jeprof at $JPROF_PATH"
cp "$JPROF_PATH" .
chmod +x jeprof
echo "Copied jeprof to current directory and made it executable."
else
echo "jeprof not found"
exit 1
fi

View File

@@ -0,0 +1,89 @@
#!/bin/bash
# Generate flame graphs from a series of `.gprof` files
# First argument: Path to the binary executable
# Second argument: Path to directory containing gprof files
# Requires `jeprof` and `flamegraph.pl` in current directory
# What this script essentially does is:
# ./jeprof <binary> <gprof> --collapse | ./flamegraph.pl > <output>
# For differential analysis between consecutive profiles:
# ./jeprof <binary> --base <gprof1> <gprof2> --collapse | ./flamegraph.pl > <output_diff>
set -e # Exit immediately if a command exits with a non-zero status.
# Check for required tools
if [ ! -f "./jeprof" ]; then
echo "Error: jeprof not found in the current directory."
exit 1
fi
if [ ! -f "./flamegraph.pl" ]; then
echo "Error: flamegraph.pl not found in the current directory."
exit 1
fi
# Check arguments
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <binary_path> <gprof_directory>"
exit 1
fi
BINARY_PATH=$1
GPROF_DIR=$2
OUTPUT_DIR="${GPROF_DIR}/flamegraphs" # Store outputs in a subdirectory
if [ ! -f "$BINARY_PATH" ]; then
echo "Error: Binary file not found at $BINARY_PATH"
exit 1
fi
if [ ! -d "$GPROF_DIR" ]; then
echo "Error: gprof directory not found at $GPROF_DIR"
exit 1
fi
mkdir -p "$OUTPUT_DIR"
echo "Generating flamegraphs in $OUTPUT_DIR"
# Find and sort gprof files
# Use find + sort -V for natural sort of version numbers if present in filenames
# Use null-terminated strings for safety with find/xargs/sort
mapfile -d $'\0' gprof_files < <(find "$GPROF_DIR" -maxdepth 1 -name '*.gprof' -print0 | sort -zV)
if [ ${#gprof_files[@]} -eq 0 ]; then
echo "No .gprof files found in $GPROF_DIR"
exit 0
fi
prev_gprof=""
# Generate flamegraphs
for gprof_file in "${gprof_files[@]}"; do
# Skip empty entries if any
if [ -z "$gprof_file" ]; then
continue
fi
filename=$(basename "$gprof_file" .gprof)
output_collapse="${OUTPUT_DIR}/${filename}.collapse"
output_svg="${OUTPUT_DIR}/${filename}.svg"
echo "Generating collapse file for $gprof_file -> $output_collapse"
./jeprof "$BINARY_PATH" "$gprof_file" --collapse > "$output_collapse"
echo "Generating flamegraph for $gprof_file -> $output_svg"
./flamegraph.pl "$output_collapse" > "$output_svg" || true
# Generate diff flamegraph if not the first file
if [ -n "$prev_gprof" ]; then
prev_filename=$(basename "$prev_gprof" .gprof)
diff_output_collapse="${OUTPUT_DIR}/${prev_filename}_vs_${filename}_diff.collapse"
diff_output_svg="${OUTPUT_DIR}/${prev_filename}_vs_${filename}_diff.svg"
echo "Generating diff collapse file for $prev_gprof vs $gprof_file -> $diff_output_collapse"
./jeprof "$BINARY_PATH" --base "$prev_gprof" "$gprof_file" --collapse > "$diff_output_collapse"
echo "Generating diff flamegraph for $prev_gprof vs $gprof_file -> $diff_output_svg"
./flamegraph.pl "$diff_output_collapse" > "$diff_output_svg" || true
fi
prev_gprof="$gprof_file"
done
echo "Flamegraph generation complete."

View File

@@ -0,0 +1,44 @@
#!/bin/bash
# Generate flame graphs from .collapse files
# Argument: Path to directory containing collapse files
# Requires `flamegraph.pl` in current directory
# Check if flamegraph.pl exists
if [ ! -f "./flamegraph.pl" ]; then
echo "Error: flamegraph.pl not found in the current directory."
exit 1
fi
# Check if directory argument is provided
if [ -z "$1" ]; then
echo "Usage: $0 <collapse_directory>"
exit 1
fi
COLLAPSE_DIR=$1
# Check if the provided argument is a directory
if [ ! -d "$COLLAPSE_DIR" ]; then
echo "Error: '$COLLAPSE_DIR' is not a valid directory."
exit 1
fi
echo "Generating flame graphs from collapse files in '$COLLAPSE_DIR'..."
# Find and process each .collapse file
find "$COLLAPSE_DIR" -maxdepth 1 -name "*.collapse" -print0 | while IFS= read -r -d $'\0' collapse_file; do
if [ -f "$collapse_file" ]; then
# Construct the output SVG filename
svg_file="${collapse_file%.collapse}.svg"
echo "Generating $svg_file from $collapse_file..."
./flamegraph.pl "$collapse_file" > "$svg_file"
if [ $? -ne 0 ]; then
echo "Error generating flame graph for $collapse_file"
else
echo "Successfully generated $svg_file"
fi
fi
done
echo "Flame graph generation complete."

View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Download flamegraph.pl to current directory - this is the flame graph generation tool script
curl https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl > ./flamegraph.pl
chmod +x ./flamegraph.pl

View File

@@ -0,0 +1,74 @@
# Copyright 2023 Greptime Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from multiprocessing import Pool
def find_rust_files(directory):
    rust_files = []
    for root, _, files in os.walk(directory):
        # Skip files with "test" in the path
        if "test" in root.lower():
            continue
        for file in files:
            # Skip files with "test" in the filename
            if "test" in file.lower():
                continue
            if file.endswith(".rs"):
                rust_files.append(os.path.join(root, file))
    return rust_files


def check_file_for_super_import(file_path):
    with open(file_path, "r") as file:
        lines = file.readlines()
    violations = []
    for line_number, line in enumerate(lines, 1):
        # Check for "use super::" without leading tab
        if line.startswith("use super::"):
            violations.append((line_number, line.strip()))
    if violations:
        return file_path, violations
    return None


def main():
    rust_files = find_rust_files(".")
    with Pool() as pool:
        results = pool.map(check_file_for_super_import, rust_files)
    # Filter out None results
    violations = [result for result in results if result]
    if violations:
        print("Found 'use super::' without leading tab in the following files:")
        counter = 1
        for file_path, file_violations in violations:
            for line_number, line in file_violations:
                print(f"{counter:>5} {file_path}:{line_number} - {line}")
                counter += 1
        raise SystemExit(1)
    else:
        print("No 'use super::' without leading tab found. All files are compliant.")


if __name__ == "__main__":
    main()
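
A usage sketch for this check; the file name `check_super_imports.py` is an assumption for illustration (the diff does not show the script's name), and it is run from the repository root so the `.` walk covers all crates:

```bash
# Exits non-zero and lists the offending lines if any non-indented `use super::` import is found.
python3 check_super_imports.py
```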

View File

@@ -49,7 +49,6 @@ pub use table_names::*;
use views::InformationSchemaViews;
use self::columns::InformationSchemaColumns;
use super::{SystemSchemaProviderInner, SystemTable, SystemTableRef};
use crate::error::{Error, Result};
use crate::system_schema::information_schema::cluster_info::InformationSchemaClusterInfo;
use crate::system_schema::information_schema::flows::InformationSchemaFlows;
@@ -63,7 +62,9 @@ use crate::system_schema::information_schema::table_constraints::InformationSche
use crate::system_schema::information_schema::tables::InformationSchemaTables;
use crate::system_schema::memory_table::MemoryTable;
pub(crate) use crate::system_schema::predicate::Predicates;
use crate::system_schema::SystemSchemaProvider;
use crate::system_schema::{
SystemSchemaProvider, SystemSchemaProviderInner, SystemTable, SystemTableRef,
};
use crate::CatalogManager;
lazy_static! {

View File

@@ -36,9 +36,8 @@ use datatypes::vectors::{
use snafu::ResultExt;
use store_api::storage::{ScanRequest, TableId};
use super::CLUSTER_INFO;
use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result};
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, CLUSTER_INFO};
use crate::system_schema::utils;
use crate::CatalogManager;

View File

@@ -38,11 +38,11 @@ use snafu::{OptionExt, ResultExt};
use sql::statements;
use store_api::storage::{ScanRequest, TableId};
use super::{InformationTable, COLUMNS};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::Predicates;
use crate::system_schema::information_schema::{InformationTable, COLUMNS};
use crate::CatalogManager;
#[derive(Debug)]

View File

@@ -18,7 +18,7 @@ use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE};
use datatypes::schema::{Schema, SchemaRef};
use datatypes::vectors::{Int64Vector, StringVector, VectorRef};
use super::table_names::*;
use crate::system_schema::information_schema::table_names::*;
use crate::system_schema::utils::tables::{
bigint_column, string_column, string_columns, timestamp_micro_column,
};

View File

@@ -24,18 +24,17 @@ use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatch
use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::prelude::{ConcreteDataType, MutableVector, ScalarVectorBuilder, VectorRef};
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::schema::{ColumnSchema, FulltextBackend, Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::{ConstantVector, StringVector, StringVectorBuilder, UInt32VectorBuilder};
use futures_util::TryStreamExt;
use snafu::{OptionExt, ResultExt};
use store_api::storage::{ScanRequest, TableId};
use super::KEY_COLUMN_USAGE;
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, KEY_COLUMN_USAGE};
use crate::CatalogManager;
pub const CONSTRAINT_SCHEMA: &str = "constraint_schema";
@@ -48,20 +47,38 @@ pub const TABLE_SCHEMA: &str = "table_schema";
pub const TABLE_NAME: &str = "table_name";
pub const COLUMN_NAME: &str = "column_name";
pub const ORDINAL_POSITION: &str = "ordinal_position";
/// The type of the index.
pub const GREPTIME_INDEX_TYPE: &str = "greptime_index_type";
const INIT_CAPACITY: usize = 42;
/// Primary key constraint name
pub(crate) const PRI_CONSTRAINT_NAME: &str = "PRIMARY";
/// Time index constraint name
pub(crate) const TIME_INDEX_CONSTRAINT_NAME: &str = "TIME INDEX";
pub(crate) const CONSTRAINT_NAME_TIME_INDEX: &str = "TIME INDEX";
/// Primary key constraint name
pub(crate) const CONSTRAINT_NAME_PRI: &str = "PRIMARY";
/// Primary key index type
pub(crate) const INDEX_TYPE_PRI: &str = "greptime-primary-key-v1";
/// Inverted index constraint name
pub(crate) const INVERTED_INDEX_CONSTRAINT_NAME: &str = "INVERTED INDEX";
pub(crate) const CONSTRAINT_NAME_INVERTED_INDEX: &str = "INVERTED INDEX";
/// Inverted index type
pub(crate) const INDEX_TYPE_INVERTED_INDEX: &str = "greptime-inverted-index-v1";
/// Fulltext index constraint name
pub(crate) const FULLTEXT_INDEX_CONSTRAINT_NAME: &str = "FULLTEXT INDEX";
pub(crate) const CONSTRAINT_NAME_FULLTEXT_INDEX: &str = "FULLTEXT INDEX";
/// Fulltext index v1 type
pub(crate) const INDEX_TYPE_FULLTEXT_TANTIVY: &str = "greptime-fulltext-index-v1";
/// Fulltext index bloom type
pub(crate) const INDEX_TYPE_FULLTEXT_BLOOM: &str = "greptime-fulltext-index-bloom";
/// Skipping index constraint name
pub(crate) const SKIPPING_INDEX_CONSTRAINT_NAME: &str = "SKIPPING INDEX";
pub(crate) const CONSTRAINT_NAME_SKIPPING_INDEX: &str = "SKIPPING INDEX";
/// Skipping index type
pub(crate) const INDEX_TYPE_SKIPPING_INDEX: &str = "greptime-bloom-filter-v1";
/// The virtual table implementation for `information_schema.KEY_COLUMN_USAGE`.
///
/// Provides an extra column `greptime_index_type` for the index type of the key column.
#[derive(Debug)]
pub(super) struct InformationSchemaKeyColumnUsage {
schema: SchemaRef,
@@ -121,6 +138,11 @@ impl InformationSchemaKeyColumnUsage {
ConcreteDataType::string_datatype(),
true,
),
ColumnSchema::new(
GREPTIME_INDEX_TYPE,
ConcreteDataType::string_datatype(),
true,
),
]))
}
@@ -185,6 +207,7 @@ struct InformationSchemaKeyColumnUsageBuilder {
column_name: StringVectorBuilder,
ordinal_position: UInt32VectorBuilder,
position_in_unique_constraint: UInt32VectorBuilder,
greptime_index_type: StringVectorBuilder,
}
impl InformationSchemaKeyColumnUsageBuilder {
@@ -207,6 +230,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
column_name: StringVectorBuilder::with_capacity(INIT_CAPACITY),
ordinal_position: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
position_in_unique_constraint: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
greptime_index_type: StringVectorBuilder::with_capacity(INIT_CAPACITY),
}
}
@@ -230,34 +254,47 @@ impl InformationSchemaKeyColumnUsageBuilder {
for (idx, column) in schema.column_schemas().iter().enumerate() {
let mut constraints = vec![];
let mut greptime_index_type = vec![];
if column.is_time_index() {
self.add_key_column_usage(
&predicates,
&schema_name,
TIME_INDEX_CONSTRAINT_NAME,
CONSTRAINT_NAME_TIME_INDEX,
&catalog_name,
&schema_name,
table_name,
&column.name,
1, //always 1 for time index
"",
);
}
// TODO(dimbtp): foreign key constraint not supported yet
if keys.contains(&idx) {
constraints.push(PRI_CONSTRAINT_NAME);
constraints.push(CONSTRAINT_NAME_PRI);
greptime_index_type.push(INDEX_TYPE_PRI);
}
if column.is_inverted_indexed() {
constraints.push(INVERTED_INDEX_CONSTRAINT_NAME);
constraints.push(CONSTRAINT_NAME_INVERTED_INDEX);
greptime_index_type.push(INDEX_TYPE_INVERTED_INDEX);
}
if column.is_fulltext_indexed() {
constraints.push(FULLTEXT_INDEX_CONSTRAINT_NAME);
if let Ok(Some(options)) = column.fulltext_options() {
if options.enable {
constraints.push(CONSTRAINT_NAME_FULLTEXT_INDEX);
let index_type = match options.backend {
FulltextBackend::Bloom => INDEX_TYPE_FULLTEXT_BLOOM,
FulltextBackend::Tantivy => INDEX_TYPE_FULLTEXT_TANTIVY,
};
greptime_index_type.push(index_type);
}
}
if column.is_skipping_indexed() {
constraints.push(SKIPPING_INDEX_CONSTRAINT_NAME);
constraints.push(CONSTRAINT_NAME_SKIPPING_INDEX);
greptime_index_type.push(INDEX_TYPE_SKIPPING_INDEX);
}
if !constraints.is_empty() {
let aggregated_constraints = constraints.join(", ");
let aggregated_index_types = greptime_index_type.join(", ");
self.add_key_column_usage(
&predicates,
&schema_name,
@@ -267,6 +304,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
table_name,
&column.name,
idx as u32 + 1,
&aggregated_index_types,
);
}
}
@@ -289,6 +327,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
table_name: &str,
column_name: &str,
ordinal_position: u32,
index_types: &str,
) {
let row = [
(CONSTRAINT_SCHEMA, &Value::from(constraint_schema)),
@@ -298,6 +337,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
(TABLE_NAME, &Value::from(table_name)),
(COLUMN_NAME, &Value::from(column_name)),
(ORDINAL_POSITION, &Value::from(ordinal_position)),
(GREPTIME_INDEX_TYPE, &Value::from(index_types)),
];
if !predicates.eval(&row) {
@@ -314,6 +354,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
self.column_name.push(Some(column_name));
self.ordinal_position.push(Some(ordinal_position));
self.position_in_unique_constraint.push(None);
self.greptime_index_type.push(Some(index_types));
}
fn finish(&mut self) -> Result<RecordBatch> {
@@ -337,6 +378,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
null_string_vector.clone(),
null_string_vector.clone(),
null_string_vector,
Arc::new(self.greptime_index_type.finish()),
];
RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu)
}

View File

@@ -39,13 +39,12 @@ use snafu::{OptionExt, ResultExt};
use store_api::storage::{ScanRequest, TableId};
use table::metadata::{TableInfo, TableType};
use super::PARTITIONS;
use crate::error::{
CreateRecordBatchSnafu, FindPartitionsSnafu, InternalSnafu, PartitionManagerNotFoundSnafu,
Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::kvbackend::KvBackendCatalogManager;
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, PARTITIONS};
use crate::CatalogManager;
const TABLE_CATALOG: &str = "table_catalog";

View File

@@ -33,9 +33,8 @@ use datatypes::vectors::{StringVectorBuilder, TimestampMillisecondVectorBuilder}
use snafu::ResultExt;
use store_api::storage::{ScanRequest, TableId};
use super::PROCEDURE_INFO;
use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result};
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, PROCEDURE_INFO};
use crate::system_schema::utils;
use crate::CatalogManager;

View File

@@ -35,13 +35,12 @@ use snafu::{OptionExt, ResultExt};
use store_api::storage::{RegionId, ScanRequest, TableId};
use table::metadata::TableType;
use super::REGION_PEERS;
use crate::error::{
CreateRecordBatchSnafu, FindRegionRoutesSnafu, InternalSnafu, Result,
UpgradeWeakCatalogManagerRefSnafu,
};
use crate::kvbackend::KvBackendCatalogManager;
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, REGION_PEERS};
use crate::CatalogManager;
pub const TABLE_CATALOG: &str = "table_catalog";

View File

@@ -30,9 +30,9 @@ use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder, UInt64VectorB
use snafu::ResultExt;
use store_api::storage::{ScanRequest, TableId};
use super::{InformationTable, REGION_STATISTICS};
use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result};
use crate::information_schema::Predicates;
use crate::system_schema::information_schema::{InformationTable, REGION_STATISTICS};
use crate::system_schema::utils;
use crate::CatalogManager;

View File

@@ -35,8 +35,8 @@ use itertools::Itertools;
use snafu::ResultExt;
use store_api::storage::{ScanRequest, TableId};
use super::{InformationTable, RUNTIME_METRICS};
use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result};
use crate::system_schema::information_schema::{InformationTable, RUNTIME_METRICS};
#[derive(Debug)]
pub(super) struct InformationSchemaMetrics {

View File

@@ -31,12 +31,11 @@ use datatypes::vectors::StringVectorBuilder;
use snafu::{OptionExt, ResultExt};
use store_api::storage::{ScanRequest, TableId};
use super::SCHEMATA;
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, TableMetadataManagerSnafu,
UpgradeWeakCatalogManagerRefSnafu,
};
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, SCHEMATA};
use crate::system_schema::utils;
use crate::CatalogManager;

View File

@@ -32,14 +32,14 @@ use futures::TryStreamExt;
use snafu::{OptionExt, ResultExt};
use store_api::storage::{ScanRequest, TableId};
use super::{InformationTable, TABLE_CONSTRAINTS};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::key_column_usage::{
PRI_CONSTRAINT_NAME, TIME_INDEX_CONSTRAINT_NAME,
CONSTRAINT_NAME_PRI, CONSTRAINT_NAME_TIME_INDEX,
};
use crate::information_schema::Predicates;
use crate::system_schema::information_schema::{InformationTable, TABLE_CONSTRAINTS};
use crate::CatalogManager;
/// The `TABLE_CONSTRAINTS` table describes which tables have constraints.
@@ -188,7 +188,7 @@ impl InformationSchemaTableConstraintsBuilder {
self.add_table_constraint(
&predicates,
&schema_name,
TIME_INDEX_CONSTRAINT_NAME,
CONSTRAINT_NAME_TIME_INDEX,
&schema_name,
&table.table_info().name,
TIME_INDEX_CONSTRAINT_TYPE,
@@ -199,7 +199,7 @@ impl InformationSchemaTableConstraintsBuilder {
self.add_table_constraint(
&predicates,
&schema_name,
PRI_CONSTRAINT_NAME,
CONSTRAINT_NAME_PRI,
&schema_name,
&table.table_info().name,
PRI_KEY_CONSTRAINT_TYPE,

View File

@@ -38,11 +38,10 @@ use snafu::{OptionExt, ResultExt};
use store_api::storage::{RegionId, ScanRequest, TableId};
use table::metadata::{TableInfo, TableType};
use super::TABLES;
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, TABLES};
use crate::system_schema::utils;
use crate::CatalogManager;

View File

@@ -32,13 +32,12 @@ use snafu::{OptionExt, ResultExt};
use store_api::storage::{ScanRequest, TableId};
use table::metadata::TableType;
use super::VIEWS;
use crate::error::{
CastManagerSnafu, CreateRecordBatchSnafu, GetViewCacheSnafu, InternalSnafu, Result,
UpgradeWeakCatalogManagerRefSnafu, ViewInfoNotFoundSnafu,
};
use crate::kvbackend::KvBackendCatalogManager;
use crate::system_schema::information_schema::{InformationTable, Predicates};
use crate::system_schema::information_schema::{InformationTable, Predicates, VIEWS};
use crate::CatalogManager;
const INIT_CAPACITY: usize = 42;

View File

@@ -29,8 +29,8 @@ use datatypes::vectors::VectorRef;
use snafu::ResultExt;
use store_api::storage::{ScanRequest, TableId};
use super::SystemTable;
use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result};
use crate::system_schema::SystemTable;
/// A memory table with specified schema and columns.
#[derive(Debug)]

View File

@@ -34,9 +34,9 @@ use table::TableRef;
pub use table_names::*;
use self::pg_namespace::oid_map::{PGNamespaceOidMap, PGNamespaceOidMapRef};
use super::memory_table::MemoryTable;
use super::utils::tables::u32_column;
use super::{SystemSchemaProvider, SystemSchemaProviderInner, SystemTableRef};
use crate::system_schema::memory_table::MemoryTable;
use crate::system_schema::utils::tables::u32_column;
use crate::system_schema::{SystemSchemaProvider, SystemSchemaProviderInner, SystemTableRef};
use crate::CatalogManager;
lazy_static! {

View File

@@ -17,9 +17,9 @@ use std::sync::Arc;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::vectors::{Int16Vector, StringVector, UInt32Vector, VectorRef};
use super::oid_column;
use super::table_names::PG_TYPE;
use crate::memory_table_cols;
use crate::system_schema::pg_catalog::oid_column;
use crate::system_schema::pg_catalog::table_names::PG_TYPE;
use crate::system_schema::utils::tables::{i16_column, string_column};
fn pg_type_schema_columns() -> (Vec<ColumnSchema>, Vec<VectorRef>) {

View File

@@ -32,12 +32,12 @@ use snafu::{OptionExt, ResultExt};
use store_api::storage::ScanRequest;
use table::metadata::TableType;
use super::pg_namespace::oid_map::PGNamespaceOidMapRef;
use super::{query_ctx, OID_COLUMN_NAME, PG_CLASS};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::Predicates;
use crate::system_schema::pg_catalog::pg_namespace::oid_map::PGNamespaceOidMapRef;
use crate::system_schema::pg_catalog::{query_ctx, OID_COLUMN_NAME, PG_CLASS};
use crate::system_schema::utils::tables::{string_column, u32_column};
use crate::system_schema::SystemTable;
use crate::CatalogManager;

View File

@@ -29,12 +29,12 @@ use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder, VectorRef};
use snafu::{OptionExt, ResultExt};
use store_api::storage::ScanRequest;
use super::pg_namespace::oid_map::PGNamespaceOidMapRef;
use super::{query_ctx, OID_COLUMN_NAME, PG_DATABASE};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::Predicates;
use crate::system_schema::pg_catalog::pg_namespace::oid_map::PGNamespaceOidMapRef;
use crate::system_schema::pg_catalog::{query_ctx, OID_COLUMN_NAME, PG_DATABASE};
use crate::system_schema::utils::tables::{string_column, u32_column};
use crate::system_schema::SystemTable;
use crate::CatalogManager;

View File

@@ -35,11 +35,13 @@ use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder, VectorRef};
use snafu::{OptionExt, ResultExt};
use store_api::storage::ScanRequest;
use super::{query_ctx, PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::Predicates;
use crate::system_schema::pg_catalog::{
query_ctx, PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE,
};
use crate::system_schema::utils::tables::{string_column, u32_column};
use crate::system_schema::SystemTable;
use crate::CatalogManager;

View File

@@ -437,10 +437,7 @@ mod tests {
}
fn column(name: &str) -> Expr {
Expr::Column(Column {
relation: None,
name: name.to_string(),
})
Expr::Column(Column::from_name(name))
}
fn string_literal(v: &str) -> Expr {

View File

@@ -17,7 +17,6 @@ use std::any::Any;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use rustyline::error::ReadlineError;
use snafu::{Location, Snafu};
#[derive(Snafu)]
@@ -105,52 +104,6 @@ pub enum Error {
#[snafu(display("Invalid REPL command: {reason}"))]
InvalidReplCommand { reason: String },
#[snafu(display("Cannot create REPL"))]
ReplCreation {
#[snafu(source)]
error: ReadlineError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Error reading command"))]
Readline {
#[snafu(source)]
error: ReadlineError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to request database, sql: {sql}"))]
RequestDatabase {
sql: String,
#[snafu(source)]
source: client::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to collect RecordBatches"))]
CollectRecordBatches {
#[snafu(implicit)]
location: Location,
source: common_recordbatch::error::Error,
},
#[snafu(display("Failed to pretty print Recordbatches"))]
PrettyPrintRecordBatches {
#[snafu(implicit)]
location: Location,
source: common_recordbatch::error::Error,
},
#[snafu(display("Failed to start Meta client"))]
StartMetaClient {
#[snafu(implicit)]
location: Location,
source: meta_client::error::Error,
},
#[snafu(display("Failed to parse SQL: {}", sql))]
ParseSql {
sql: String,
@@ -166,13 +119,6 @@ pub enum Error {
source: query::error::Error,
},
#[snafu(display("Failed to encode logical plan in substrait"))]
SubstraitEncodeLogicalPlan {
#[snafu(implicit)]
location: Location,
source: substrait::error::Error,
},
#[snafu(display("Failed to load layered config"))]
LoadLayeredConfig {
#[snafu(source(from(common_config::error::Error, Box::new)))]
@@ -318,17 +264,10 @@ impl ErrorExt for Error {
Error::StartProcedureManager { source, .. }
| Error::StopProcedureManager { source, .. } => source.status_code(),
Error::StartWalOptionsAllocator { source, .. } => source.status_code(),
Error::ReplCreation { .. } | Error::Readline { .. } | Error::HttpQuerySql { .. } => {
StatusCode::Internal
}
Error::RequestDatabase { source, .. } => source.status_code(),
Error::CollectRecordBatches { source, .. }
| Error::PrettyPrintRecordBatches { source, .. } => source.status_code(),
Error::StartMetaClient { source, .. } => source.status_code(),
Error::HttpQuerySql { .. } => StatusCode::Internal,
Error::ParseSql { source, .. } | Error::PlanStatement { source, .. } => {
source.status_code()
}
Error::SubstraitEncodeLogicalPlan { source, .. } => source.status_code(),
Error::SerdeJson { .. }
| Error::FileIo { .. }

View File

@@ -23,15 +23,12 @@ mod helper;
// Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373
mod database;
mod import;
#[allow(unused)]
mod repl;
use async_trait::async_trait;
use clap::Parser;
use common_error::ext::BoxedError;
pub use database::DatabaseClient;
use error::Result;
pub use repl::Repl;
pub use crate::bench::BenchTableMetadataCommand;
pub use crate::export::ExportCommand;

View File

@@ -1,299 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use cache::{
build_fundamental_cache_registry, with_default_composite_cache_registry, TABLE_CACHE_NAME,
TABLE_ROUTE_CACHE_NAME,
};
use catalog::information_extension::DistributedInformationExtension;
use catalog::kvbackend::{
CachedKvBackend, CachedKvBackendBuilder, KvBackendCatalogManager, MetaKvBackend,
};
use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_base::Plugins;
use common_config::Mode;
use common_error::ext::ErrorExt;
use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder};
use common_meta::kv_backend::KvBackendRef;
use common_query::Output;
use common_recordbatch::RecordBatches;
use common_telemetry::debug;
use either::Either;
use meta_client::client::{ClusterKvBackend, MetaClientBuilder};
use query::datafusion::DatafusionQueryEngine;
use query::parser::QueryLanguageParser;
use query::query_engine::{DefaultSerializer, QueryEngineState};
use query::QueryEngine;
use rustyline::error::ReadlineError;
use rustyline::Editor;
use session::context::QueryContext;
use snafu::{OptionExt, ResultExt};
use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
use crate::cmd::ReplCommand;
use crate::error::{
CollectRecordBatchesSnafu, ParseSqlSnafu, PlanStatementSnafu, PrettyPrintRecordBatchesSnafu,
ReadlineSnafu, ReplCreationSnafu, RequestDatabaseSnafu, Result, StartMetaClientSnafu,
SubstraitEncodeLogicalPlanSnafu,
};
use crate::helper::RustylineHelper;
use crate::{error, AttachCommand};
/// Captures the state of the repl, gathers commands and executes them one by one
pub struct Repl {
/// Rustyline editor for interacting with user on command line
rl: Editor<RustylineHelper>,
/// Current prompt
prompt: String,
/// Client for interacting with GreptimeDB
database: Database,
query_engine: Option<DatafusionQueryEngine>,
}
#[allow(clippy::print_stdout)]
impl Repl {
fn print_help(&self) {
println!("{}", ReplCommand::help())
}
pub(crate) async fn try_new(cmd: &AttachCommand) -> Result<Self> {
let mut rl = Editor::new().context(ReplCreationSnafu)?;
if !cmd.disable_helper {
rl.set_helper(Some(RustylineHelper::default()));
let history_file = history_file();
if let Err(e) = rl.load_history(&history_file) {
debug!(
"failed to load history file on {}, error: {e}",
history_file.display()
);
}
}
let client = Client::with_urls([&cmd.grpc_addr]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
let query_engine = if let Some(meta_addr) = &cmd.meta_addr {
create_query_engine(meta_addr).await.map(Some)?
} else {
None
};
Ok(Self {
rl,
prompt: "> ".to_string(),
database,
query_engine,
})
}
/// Parse the next command
fn next_command(&mut self) -> Result<ReplCommand> {
match self.rl.readline(&self.prompt) {
Ok(ref line) => {
let request = line.trim();
let _ = self.rl.add_history_entry(request.to_string());
request.try_into()
}
Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => Ok(ReplCommand::Exit),
// Some sort of real underlying error
Err(e) => Err(e).context(ReadlineSnafu),
}
}
/// Read Evaluate Print Loop (interactive command line) for GreptimeDB
///
/// Inspired / based on repl.rs from InfluxDB IOX
pub(crate) async fn run(&mut self) -> Result<()> {
println!("Ready for commands. (Hint: try 'help')");
loop {
match self.next_command()? {
ReplCommand::Help => {
self.print_help();
}
ReplCommand::UseDatabase { db_name } => {
if self.execute_sql(format!("USE {db_name}")).await {
println!("Using {db_name}");
self.database.set_schema(&db_name);
self.prompt = format!("[{db_name}] > ");
}
}
ReplCommand::Sql { sql } => {
let _ = self.execute_sql(sql).await;
}
ReplCommand::Exit => {
return Ok(());
}
}
}
}
async fn execute_sql(&self, sql: String) -> bool {
self.do_execute_sql(sql)
.await
.map_err(|e| {
let status_code = e.status_code();
let root_cause = e.output_msg();
println!("Error: {}({status_code}), {root_cause}", status_code as u32)
})
.is_ok()
}
async fn do_execute_sql(&self, sql: String) -> Result<()> {
let start = Instant::now();
let output = if let Some(query_engine) = &self.query_engine {
let query_ctx = Arc::new(QueryContext::with(
self.database.catalog(),
self.database.schema(),
));
let stmt = QueryLanguageParser::parse_sql(&sql, &query_ctx)
.with_context(|_| ParseSqlSnafu { sql: sql.clone() })?;
let plan = query_engine
.planner()
.plan(&stmt, query_ctx.clone())
.await
.context(PlanStatementSnafu)?;
let plan = query_engine
.optimize(&query_engine.engine_context(query_ctx), &plan)
.context(PlanStatementSnafu)?;
let plan = DFLogicalSubstraitConvertor {}
.encode(&plan, DefaultSerializer)
.context(SubstraitEncodeLogicalPlanSnafu)?;
self.database.logical_plan(plan.to_vec()).await
} else {
self.database.sql(&sql).await
}
.context(RequestDatabaseSnafu { sql: &sql })?;
let either = match output.data {
OutputData::Stream(s) => {
let x = RecordBatches::try_collect(s)
.await
.context(CollectRecordBatchesSnafu)?;
Either::Left(x)
}
OutputData::RecordBatches(x) => Either::Left(x),
OutputData::AffectedRows(rows) => Either::Right(rows),
};
let end = Instant::now();
match either {
Either::Left(recordbatches) => {
let total_rows: usize = recordbatches.iter().map(|x| x.num_rows()).sum();
if total_rows > 0 {
println!(
"{}",
recordbatches
.pretty_print()
.context(PrettyPrintRecordBatchesSnafu)?
);
}
println!("Total Rows: {total_rows}")
}
Either::Right(rows) => println!("Affected Rows: {rows}"),
};
println!("Cost {} ms", (end - start).as_millis());
Ok(())
}
}
impl Drop for Repl {
fn drop(&mut self) {
if self.rl.helper().is_some() {
let history_file = history_file();
if let Err(e) = self.rl.save_history(&history_file) {
debug!(
"failed to save history file on {}, error: {e}",
history_file.display()
);
}
}
}
}
/// Return the location of the history file (defaults to $HOME/".greptimedb_cli_history")
fn history_file() -> PathBuf {
let mut buf = match std::env::var("HOME") {
Ok(home) => PathBuf::from(home),
Err(_) => PathBuf::new(),
};
buf.push(".greptimedb_cli_history");
buf
}
async fn create_query_engine(meta_addr: &str) -> Result<DatafusionQueryEngine> {
let mut meta_client = MetaClientBuilder::default().enable_store().build();
meta_client
.start([meta_addr])
.await
.context(StartMetaClientSnafu)?;
let meta_client = Arc::new(meta_client);
let cached_meta_backend = Arc::new(
CachedKvBackendBuilder::new(Arc::new(MetaKvBackend::new(meta_client.clone()))).build(),
);
let layered_cache_builder = LayeredCacheRegistryBuilder::default().add_cache_registry(
CacheRegistryBuilder::default()
.add_cache(cached_meta_backend.clone())
.build(),
);
let fundamental_cache_registry =
build_fundamental_cache_registry(Arc::new(MetaKvBackend::new(meta_client.clone())));
let layered_cache_registry = Arc::new(
with_default_composite_cache_registry(
layered_cache_builder.add_cache_registry(fundamental_cache_registry),
)
.context(error::BuildCacheRegistrySnafu)?
.build(),
);
let information_extension = Arc::new(DistributedInformationExtension::new(meta_client.clone()));
let catalog_manager = KvBackendCatalogManager::new(
information_extension,
cached_meta_backend.clone(),
layered_cache_registry,
None,
);
let plugins: Plugins = Default::default();
let state = Arc::new(QueryEngineState::new(
catalog_manager,
None,
None,
None,
None,
false,
plugins.clone(),
));
Ok(DatafusionQueryEngine::new(state, plugins))
}

View File

@@ -16,6 +16,7 @@ arc-swap = "1.6"
arrow-flight.workspace = true
async-stream.workspace = true
async-trait.workspace = true
base64.workspace = true
common-catalog.workspace = true
common-error.workspace = true
common-grpc.workspace = true
@@ -25,6 +26,7 @@ common-query.workspace = true
common-recordbatch.workspace = true
common-telemetry.workspace = true
enum_dispatch = "0.3"
futures.workspace = true
futures-util.workspace = true
lazy_static.workspace = true
moka = { workspace = true, features = ["future"] }

View File

@@ -12,36 +12,49 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::pin::Pin;
use std::str::FromStr;
use api::v1::auth_header::AuthScheme;
use api::v1::ddl_request::Expr as DdlExpr;
use api::v1::greptime_database_client::GreptimeDatabaseClient;
use api::v1::greptime_request::Request;
use api::v1::query_request::Query;
use api::v1::{
AlterTableExpr, AuthHeader, CreateTableExpr, DdlRequest, GreptimeRequest, InsertRequests,
QueryRequest, RequestHeader,
AlterTableExpr, AuthHeader, Basic, CreateTableExpr, DdlRequest, GreptimeRequest,
InsertRequests, QueryRequest, RequestHeader,
};
use arrow_flight::Ticket;
use arrow_flight::{FlightData, Ticket};
use async_stream::stream;
use base64::prelude::BASE64_STANDARD;
use base64::Engine;
use common_catalog::build_db_string;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::{BoxedError, ErrorExt};
use common_grpc::flight::do_put::DoPutResponse;
use common_grpc::flight::{FlightDecoder, FlightMessage};
use common_query::Output;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::RecordBatchStreamWrapper;
use common_telemetry::error;
use common_telemetry::tracing_context::W3cTrace;
use futures_util::StreamExt;
use futures::future;
use futures_util::{Stream, StreamExt, TryStreamExt};
use prost::Message;
use snafu::{ensure, ResultExt};
use tonic::metadata::AsciiMetadataKey;
use tonic::metadata::{AsciiMetadataKey, MetadataValue};
use tonic::transport::Channel;
use crate::error::{
ConvertFlightDataSnafu, Error, FlightGetSnafu, IllegalFlightMessagesSnafu, InvalidAsciiSnafu,
ServerSnafu,
InvalidTonicMetadataValueSnafu, ServerSnafu,
};
use crate::{from_grpc_response, Client, Result};
type FlightDataStream = Pin<Box<dyn Stream<Item = FlightData> + Send>>;
type DoPutResponseStream = Pin<Box<dyn Stream<Item = Result<DoPutResponse>>>>;
#[derive(Clone, Debug, Default)]
pub struct Database {
// The "catalog" and "schema" to be used in processing the requests at the server side.
@@ -108,16 +121,24 @@ impl Database {
self.catalog = catalog.into();
}
pub fn catalog(&self) -> &String {
&self.catalog
fn catalog_or_default(&self) -> &str {
if self.catalog.is_empty() {
DEFAULT_CATALOG_NAME
} else {
&self.catalog
}
}
pub fn set_schema(&mut self, schema: impl Into<String>) {
self.schema = schema.into();
}
pub fn schema(&self) -> &String {
&self.schema
fn schema_or_default(&self) -> &str {
if self.schema.is_empty() {
DEFAULT_SCHEMA_NAME
} else {
&self.schema
}
}
pub fn set_timezone(&mut self, timezone: impl Into<String>) {
@@ -164,7 +185,7 @@ impl Database {
from_grpc_response(response)
}
async fn handle(&self, request: Request) -> Result<u32> {
pub async fn handle(&self, request: Request) -> Result<u32> {
let mut client = make_database_client(&self.client)?.inner;
let request = self.to_rpc_request(request);
let response = client.handle(request).await?.into_inner();
@@ -310,6 +331,41 @@ impl Database {
}
}
}
/// Ingest a stream of [RecordBatch]es that belong to a table, using Arrow Flight's "`DoPut`"
/// method. The return value is also a stream, produces [DoPutResponse]s.
pub async fn do_put(&self, stream: FlightDataStream) -> Result<DoPutResponseStream> {
let mut request = tonic::Request::new(stream);
if let Some(AuthHeader {
auth_scheme: Some(AuthScheme::Basic(Basic { username, password })),
}) = &self.ctx.auth_header
{
let encoded = BASE64_STANDARD.encode(format!("{username}:{password}"));
let value =
MetadataValue::from_str(&encoded).context(InvalidTonicMetadataValueSnafu)?;
request.metadata_mut().insert("x-greptime-auth", value);
}
let db_to_put = if !self.dbname.is_empty() {
&self.dbname
} else {
&build_db_string(self.catalog_or_default(), self.schema_or_default())
};
request.metadata_mut().insert(
"x-greptime-db-name",
MetadataValue::from_str(db_to_put).context(InvalidTonicMetadataValueSnafu)?,
);
let mut client = self.client.make_flight_client()?;
let response = client.mut_inner().do_put(request).await?;
let response = response
.into_inner()
.map_err(Into::into)
.and_then(|x| future::ready(DoPutResponse::try_from(x).context(ConvertFlightDataSnafu)))
.boxed();
Ok(response)
}
}
#[derive(Default, Debug, Clone)]

View File

@@ -15,10 +15,11 @@
use std::any::Any;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_error::status_code::{convert_tonic_code_to_status_code, StatusCode};
use common_error::{GREPTIME_DB_HEADER_ERROR_CODE, GREPTIME_DB_HEADER_ERROR_MSG};
use common_macro::stack_trace_debug;
use snafu::{location, Location, Snafu};
use tonic::metadata::errors::InvalidMetadataValue;
use tonic::{Code, Status};
#[derive(Snafu)]
@@ -115,6 +116,14 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Invalid Tonic metadata value"))]
InvalidTonicMetadataValue {
#[snafu(source)]
error: InvalidMetadataValue,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -135,7 +144,9 @@ impl ErrorExt for Error {
| Error::CreateTlsChannel { source, .. } => source.status_code(),
Error::IllegalGrpcClientState { .. } => StatusCode::Unexpected,
Error::InvalidAscii { .. } => StatusCode::InvalidArguments,
Error::InvalidAscii { .. } | Error::InvalidTonicMetadataValue { .. } => {
StatusCode::InvalidArguments
}
}
}
@@ -152,15 +163,15 @@ impl From<Status> for Error {
.and_then(|v| String::from_utf8(v.as_bytes().to_vec()).ok())
}
let code = get_metadata_value(&e, GREPTIME_DB_HEADER_ERROR_CODE)
.and_then(|s| {
if let Ok(code) = s.parse::<u32>() {
StatusCode::from_u32(code)
} else {
None
}
})
.unwrap_or(StatusCode::Unknown);
let code = get_metadata_value(&e, GREPTIME_DB_HEADER_ERROR_CODE).and_then(|s| {
if let Ok(code) = s.parse::<u32>() {
StatusCode::from_u32(code)
} else {
None
}
});
let tonic_code = e.code();
let code = code.unwrap_or_else(|| convert_tonic_code_to_status_code(tonic_code));
let msg = get_metadata_value(&e, GREPTIME_DB_HEADER_ERROR_MSG)
.unwrap_or_else(|| e.message().to_string());
@@ -187,9 +198,6 @@ impl Error {
} | Self::RegionServer {
code: Code::Unavailable,
..
} | Self::RegionServer {
code: Code::Unknown,
..
}
)
}

View File

@@ -16,7 +16,7 @@
mod client;
pub mod client_manager;
mod database;
pub mod database;
pub mod error;
pub mod flow;
pub mod load_balance;

View File

@@ -201,12 +201,11 @@ impl RegionRequester {
.await
.map_err(|e| {
let code = e.code();
let err: error::Error = e.into();
// Uses `Error::RegionServer` instead of `Error::Server`
error::Error::RegionServer {
addr,
code,
source: BoxedError::new(err),
source: BoxedError::new(error::Error::from(e)),
location: location!(),
}
})?

View File

@@ -68,7 +68,6 @@ query.workspace = true
rand.workspace = true
regex.workspace = true
reqwest.workspace = true
rustyline = "10.1"
serde.workspace = true
serde_json.workspace = true
servers.workspace = true

View File

@@ -17,7 +17,6 @@ use std::any::Any;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use rustyline::error::ReadlineError;
use snafu::{Location, Snafu};
#[derive(Snafu)]
@@ -79,6 +78,13 @@ pub enum Error {
source: datanode::error::Error,
},
#[snafu(display("Failed to build object storage manager"))]
BuildObjectStorageManager {
#[snafu(implicit)]
location: Location,
source: datanode::error::Error,
},
#[snafu(display("Failed to shutdown datanode"))]
ShutdownDatanode {
#[snafu(implicit)]
@@ -181,52 +187,6 @@ pub enum Error {
#[snafu(display("Invalid REPL command: {reason}"))]
InvalidReplCommand { reason: String },
#[snafu(display("Cannot create REPL"))]
ReplCreation {
#[snafu(source)]
error: ReadlineError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Error reading command"))]
Readline {
#[snafu(source)]
error: ReadlineError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to request database, sql: {sql}"))]
RequestDatabase {
sql: String,
#[snafu(source)]
source: client::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to collect RecordBatches"))]
CollectRecordBatches {
#[snafu(implicit)]
location: Location,
source: common_recordbatch::error::Error,
},
#[snafu(display("Failed to pretty print Recordbatches"))]
PrettyPrintRecordBatches {
#[snafu(implicit)]
location: Location,
source: common_recordbatch::error::Error,
},
#[snafu(display("Failed to start Meta client"))]
StartMetaClient {
#[snafu(implicit)]
location: Location,
source: meta_client::error::Error,
},
#[snafu(display("Failed to parse SQL: {}", sql))]
ParseSql {
sql: String,
@@ -242,13 +202,6 @@ pub enum Error {
source: query::error::Error,
},
#[snafu(display("Failed to encode logical plan in substrait"))]
SubstraitEncodeLogicalPlan {
#[snafu(implicit)]
location: Location,
source: substrait::error::Error,
},
#[snafu(display("Failed to load layered config"))]
LoadLayeredConfig {
#[snafu(source(from(common_config::error::Error, Box::new)))]
@@ -382,6 +335,8 @@ impl ErrorExt for Error {
source.status_code()
}
Error::BuildObjectStorageManager { source, .. } => source.status_code(),
Error::MissingConfig { .. }
| Error::LoadLayeredConfig { .. }
| Error::IllegalConfig { .. }
@@ -395,17 +350,10 @@ impl ErrorExt for Error {
| Error::StopProcedureManager { source, .. } => source.status_code(),
Error::BuildWalOptionsAllocator { source, .. }
| Error::StartWalOptionsAllocator { source, .. } => source.status_code(),
Error::ReplCreation { .. } | Error::Readline { .. } | Error::HttpQuerySql { .. } => {
StatusCode::Internal
}
Error::RequestDatabase { source, .. } => source.status_code(),
Error::CollectRecordBatches { source, .. }
| Error::PrettyPrintRecordBatches { source, .. } => source.status_code(),
Error::StartMetaClient { source, .. } => source.status_code(),
Error::HttpQuerySql { .. } => StatusCode::Internal,
Error::ParseSql { source, .. } | Error::PlanStatement { source, .. } => {
source.status_code()
}
Error::SubstraitEncodeLogicalPlan { source, .. } => source.status_code(),
Error::SerdeJson { .. }
| Error::FileIo { .. }

View File

@@ -32,7 +32,9 @@ use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::TracingOptions;
use common_version::{short_version, version};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
use flow::{
FlownodeBuilder, FlownodeInstance, FlownodeServiceBuilder, FrontendClient, FrontendInvoker,
};
use meta_client::{MetaClientOptions, MetaClientType};
use snafu::{ensure, OptionExt, ResultExt};
use tracing_appender::non_blocking::WorkerGuard;
@@ -313,16 +315,26 @@ impl StartCommand {
);
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
let flownode_builder = FlownodeBuilder::new(
opts,
opts.clone(),
Plugins::new(),
table_metadata_manager,
catalog_manager.clone(),
flow_metadata_manager,
Arc::new(frontend_client),
)
.with_heartbeat_task(heartbeat_task);
let flownode = flownode_builder.build().await.context(StartFlownodeSnafu)?;
let mut flownode = flownode_builder.build().await.context(StartFlownodeSnafu)?;
let services = FlownodeServiceBuilder::new(&opts)
.with_grpc_server(flownode.flownode_server().clone())
.enable_http_service()
.build()
.await
.context(StartFlownodeSnafu)?;
flownode.setup_services(services);
let flownode = flownode;
// The flownode's frontend-to-datanode requests need no timeout;
// some queries are expected to take a long time.

View File

@@ -44,6 +44,7 @@ use common_meta::peer::Peer;
use common_meta::region_keeper::MemoryRegionKeeper;
use common_meta::region_registry::LeaderRegionRegistry;
use common_meta::sequence::SequenceBuilder;
use common_meta::snapshot::MetadataSnapshotManager;
use common_meta::wal_options_allocator::{build_wal_options_allocator, WalOptionsAllocatorRef};
use common_procedure::{ProcedureInfo, ProcedureManagerRef};
use common_telemetry::info;
@@ -55,7 +56,10 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
use datanode::datanode::{Datanode, DatanodeBuilder};
use datanode::region_server::RegionServer;
use file_engine::config::EngineConfig as FileEngineConfig;
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
use flow::{
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeInstance, FlownodeOptions,
FrontendClient, FrontendInvoker,
};
use frontend::frontend::{Frontend, FrontendOptions};
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{Instance as FeInstance, StandaloneDatanodeManager};
@@ -74,10 +78,10 @@ use servers::http::HttpOptions;
use servers::tls::{TlsMode, TlsOption};
use servers::Mode;
use snafu::ResultExt;
use tokio::sync::{broadcast, RwLock};
use tokio::sync::RwLock;
use tracing_appender::non_blocking::WorkerGuard;
use crate::error::Result;
use crate::error::{Result, StartFlownodeSnafu};
use crate::options::{GlobalOptions, GreptimeOptions};
use crate::{error, log_versions, App};
@@ -244,9 +248,7 @@ impl StandaloneOptions {
pub struct Instance {
datanode: Datanode,
frontend: Frontend,
// TODO(discord9): wrapped it in flownode instance instead
flow_worker_manager: Arc<FlowWorkerManager>,
flow_shutdown: broadcast::Sender<()>,
flownode: FlownodeInstance,
procedure_manager: ProcedureManagerRef,
wal_options_allocator: WalOptionsAllocatorRef,
// Keep the logging guard to prevent the worker from being dropped.
@@ -288,9 +290,7 @@ impl App for Instance {
.await
.context(error::StartFrontendSnafu)?;
self.flow_worker_manager
.clone()
.run_background(Some(self.flow_shutdown.subscribe()));
self.flownode.start().await.context(StartFlownodeSnafu)?;
Ok(())
}
@@ -311,14 +311,9 @@ impl App for Instance {
.await
.context(error::ShutdownDatanodeSnafu)?;
self.flow_shutdown
.send(())
.map_err(|_e| {
flow::error::InternalSnafu {
reason: "Failed to send shutdown signal to flow worker manager, all receiver end already closed".to_string(),
}
.build()
})
self.flownode
.shutdown()
.await
.context(error::ShutdownFlownodeSnafu)?;
info!("Datanode instance stopped.");
@@ -503,6 +498,10 @@ impl StartCommand {
.build(),
);
let object_store_manager = DatanodeBuilder::build_object_store_manager(&dn_opts.storage)
.await
.context(error::BuildObjectStorageManagerSnafu)?;
let datanode = DatanodeBuilder::new(dn_opts, plugins.clone(), Mode::Standalone)
.with_kv_backend(kv_backend.clone())
.with_cache_registry(layered_cache_registry.clone())
@@ -529,20 +528,24 @@ impl StartCommand {
flow: opts.flow.clone(),
..Default::default()
};
// TODO(discord9): for standalone, avoid using grpc and instead obtain a handle to the frontend grpc client
// without actually making a connection
let fe_server_addr = fe_opts.grpc.bind_addr.clone();
let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
flownode_options,
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
Arc::new(frontend_client),
);
let flownode = Arc::new(
flow_builder
.build()
.await
.map_err(BoxedError::new)
.context(error::OtherSnafu)?,
);
let flownode = flow_builder
.build()
.await
.map_err(BoxedError::new)
.context(error::OtherSnafu)?;
// set the ref to query for the local flow state
{
@@ -593,6 +596,11 @@ impl StartCommand {
)
.await?;
let metadata_snapshot_manager = MetadataSnapshotManager::new(
kv_backend.clone(),
object_store_manager.default_object_store().clone(),
);
let fe_instance = FrontendBuilder::new(
fe_opts.clone(),
kv_backend.clone(),
@@ -603,6 +611,7 @@ impl StartCommand {
StatementStatistics::new(opts.logging.slow_query.clone()),
)
.with_plugin(plugins.clone())
.with_metadata_snapshot_manager(metadata_snapshot_manager)
.try_build()
.await
.context(error::StartFrontendSnafu)?;
@@ -622,8 +631,6 @@ impl StartCommand {
.context(error::StartFlownodeSnafu)?;
flow_worker_manager.set_frontend_invoker(invoker).await;
let (tx, _rx) = broadcast::channel(1);
let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins))
.context(error::ServersSnafu)?;
@@ -642,8 +649,7 @@ impl StartCommand {
Ok(Instance {
datanode,
frontend,
flow_worker_manager,
flow_shutdown: tx,
flownode,
procedure_manager,
wal_options_allocator,
_guard: guard,
@@ -784,6 +790,8 @@ impl InformationExtension for StandaloneInformationExtension {
sst_size: region_stat.sst_size,
index_size: region_stat.index_size,
region_manifest: region_stat.manifest.into(),
data_topic_latest_entry_id: region_stat.data_topic_latest_entry_id,
metadata_topic_latest_entry_id: region_stat.metadata_topic_latest_entry_id,
}
})
.collect::<Vec<_>>();

View File

@@ -1,148 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(target_os = "macos")]
mod tests {
use std::path::PathBuf;
use std::process::{Command, Stdio};
use std::time::Duration;
use common_test_util::temp_dir::create_temp_dir;
use rexpect::session::PtyReplSession;
struct Repl {
repl: PtyReplSession,
}
impl Repl {
fn send_line(&mut self, line: &str) {
let _ = self.repl.send_line(line).unwrap();
// read a line to consume the prompt
let _ = self.read_line();
}
fn read_line(&mut self) -> String {
self.repl.read_line().unwrap()
}
fn read_expect(&mut self, expect: &str) {
assert_eq!(self.read_line(), expect);
}
fn read_contains(&mut self, pat: &str) {
assert!(self.read_line().contains(pat));
}
}
// TODO(LFC): Un-ignore this REPL test.
// Ignore this REPL test because some logical plans like create database are not supported yet in Datanode.
#[ignore]
#[test]
fn test_repl() {
let data_home = create_temp_dir("data");
let wal_dir = create_temp_dir("wal");
let mut bin_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
bin_path.push("../../target/debug");
let bin_path = bin_path.to_str().unwrap();
let mut datanode = Command::new("./greptime")
.current_dir(bin_path)
.args([
"datanode",
"start",
"--rpc-bind-addr=0.0.0.0:4321",
"--node-id=1",
&format!("--data-home={}", data_home.path().display()),
&format!("--wal-dir={}", wal_dir.path().display()),
])
.stdout(Stdio::null())
.spawn()
.unwrap();
// wait for the Datanode to actually start
std::thread::sleep(Duration::from_secs(3));
let mut repl_cmd = Command::new("./greptime");
let _ = repl_cmd.current_dir(bin_path).args([
"--log-level=off",
"cli",
"attach",
"--grpc-bind-addr=0.0.0.0:4321",
// history commands can sneak into stdout and mess up our tests, so disable the helper
"--disable-helper",
]);
let pty_session = rexpect::session::spawn_command(repl_cmd, Some(5_000)).unwrap();
let repl = PtyReplSession {
prompt: "> ".to_string(),
pty_session,
quit_command: None,
echo_on: false,
};
let repl = &mut Repl { repl };
repl.read_expect("Ready for commands. (Hint: try 'help')");
test_create_database(repl);
test_use_database(repl);
test_create_table(repl);
test_insert(repl);
test_select(repl);
datanode.kill().unwrap();
let _ = datanode.wait().unwrap();
}
fn test_create_database(repl: &mut Repl) {
repl.send_line("CREATE DATABASE db;");
repl.read_expect("Affected Rows: 1");
repl.read_contains("Cost");
}
fn test_use_database(repl: &mut Repl) {
repl.send_line("USE db");
repl.read_expect("Total Rows: 0");
repl.read_contains("Cost");
repl.read_expect("Using db");
}
fn test_create_table(repl: &mut Repl) {
repl.send_line("CREATE TABLE t(x STRING, ts TIMESTAMP TIME INDEX);");
repl.read_expect("Affected Rows: 0");
repl.read_contains("Cost");
}
fn test_insert(repl: &mut Repl) {
repl.send_line("INSERT INTO t(x, ts) VALUES ('hello', 1676895812239);");
repl.read_expect("Affected Rows: 1");
repl.read_contains("Cost");
}
fn test_select(repl: &mut Repl) {
repl.send_line("SELECT * FROM t;");
repl.read_expect("+-------+-------------------------+");
repl.read_expect("| x | ts |");
repl.read_expect("+-------+-------------------------+");
repl.read_expect("| hello | 2023-02-20T12:23:32.239 |");
repl.read_expect("+-------+-------------------------+");
repl.read_expect("Total Rows: 1");
repl.read_contains("Cost");
}
}

View File

@@ -168,8 +168,8 @@ fn test_load_metasrv_example_config() {
tracing_sample_ratio: Some(Default::default()),
slow_query: SlowQueryOptions {
enable: false,
threshold: Some(Duration::from_secs(10)),
sample_ratio: Some(1.0),
threshold: None,
sample_ratio: None,
},
..Default::default()
},

View File

@@ -137,4 +137,12 @@ pub const SPAN_ID_COLUMN: &str = "span_id";
pub const SPAN_NAME_COLUMN: &str = "span_name";
pub const SERVICE_NAME_COLUMN: &str = "service_name";
pub const PARENT_SPAN_ID_COLUMN: &str = "parent_span_id";
pub const TRACE_TABLE_NAME: &str = "opentelemetry_traces";
pub const TRACE_TABLE_NAME_SESSION_KEY: &str = "trace_table_name";
// ---- End of special table and fields ----
/// Generate the trace services table name from the trace table name by adding the `_services` suffix.
pub fn trace_services_table_name(trace_table_name: &str) -> String {
format!("{}_services", trace_table_name)
}
// ---- End of special table and fields ----
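A trivial usage sketch of the helper above, assuming the constant and function from this hunk are in scope (the enclosing crate is not shown in the diff):

fn main() {
    // "opentelemetry_traces" + "_services"
    assert_eq!(
        trace_services_table_name(TRACE_TABLE_NAME),
        "opentelemetry_traces_services"
    );
}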

View File

@@ -31,7 +31,8 @@ derive_builder.workspace = true
futures.workspace = true
lazy_static.workspace = true
object-store.workspace = true
orc-rust = { version = "0.5", default-features = false, features = [
object_store_opendal.workspace = true
orc-rust = { git = "https://github.com/datafusion-contrib/orc-rust", rev = "3134cab581a8e91b942d6a23aca2916ea965f6bb", default-features = false, features = [
"async",
] }
parquet.workspace = true

View File

@@ -19,6 +19,7 @@ use std::str::FromStr;
use async_compression::tokio::bufread::{BzDecoder, GzipDecoder, XzDecoder, ZstdDecoder};
use async_compression::tokio::write;
use bytes::Bytes;
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
use futures::Stream;
use serde::{Deserialize, Serialize};
use strum::EnumIter;
@@ -192,3 +193,15 @@ macro_rules! impl_compression_type {
}
impl_compression_type!((Gzip, Gzip), (Bzip2, Bz), (Xz, Xz), (Zstd, Zstd));
impl From<CompressionType> for FileCompressionType {
fn from(t: CompressionType) -> Self {
match t {
CompressionType::Gzip => FileCompressionType::GZIP,
CompressionType::Bzip2 => FileCompressionType::BZIP2,
CompressionType::Xz => FileCompressionType::XZ,
CompressionType::Zstd => FileCompressionType::ZSTD,
CompressionType::Uncompressed => FileCompressionType::UNCOMPRESSED,
}
}
}

View File

@@ -14,28 +14,23 @@
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
use arrow::csv;
use arrow::csv::reader::Format;
use arrow::record_batch::RecordBatch;
use arrow_schema::{Schema, SchemaRef};
use arrow_schema::Schema;
use async_trait::async_trait;
use common_runtime;
use datafusion::datasource::physical_plan::{FileMeta, FileOpenFuture, FileOpener};
use datafusion::error::Result as DataFusionResult;
use datafusion::physical_plan::SendableRecordBatchStream;
use derive_builder::Builder;
use object_store::ObjectStore;
use snafu::ResultExt;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tokio_util::io::SyncIoBridge;
use super::stream_to_file;
use crate::buffered_writer::DfRecordBatchEncoder;
use crate::compression::CompressionType;
use crate::error::{self, Result};
use crate::file_format::{self, open_with_decoder, FileFormat};
use crate::file_format::{self, stream_to_file, FileFormat};
use crate::share_buffer::SharedBuffer;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -100,66 +95,6 @@ impl Default for CsvFormat {
}
}
#[derive(Debug, Clone, Builder)]
pub struct CsvConfig {
batch_size: usize,
file_schema: SchemaRef,
#[builder(default = "None")]
file_projection: Option<Vec<usize>>,
#[builder(default = "true")]
has_header: bool,
#[builder(default = "b','")]
delimiter: u8,
}
impl CsvConfig {
fn builder(&self) -> csv::ReaderBuilder {
let mut builder = csv::ReaderBuilder::new(self.file_schema.clone())
.with_delimiter(self.delimiter)
.with_batch_size(self.batch_size)
.with_header(self.has_header);
if let Some(proj) = &self.file_projection {
builder = builder.with_projection(proj.clone());
}
builder
}
}
#[derive(Debug, Clone)]
pub struct CsvOpener {
config: Arc<CsvConfig>,
object_store: Arc<ObjectStore>,
compression_type: CompressionType,
}
impl CsvOpener {
/// Return a new [`CsvOpener`]. The caller must ensure that [`CsvConfig`]'s `file_schema` corresponds to the file being opened.
pub fn new(
config: CsvConfig,
object_store: ObjectStore,
compression_type: CompressionType,
) -> Self {
CsvOpener {
config: Arc::new(config),
object_store: Arc::new(object_store),
compression_type,
}
}
}
impl FileOpener for CsvOpener {
fn open(&self, meta: FileMeta) -> DataFusionResult<FileOpenFuture> {
open_with_decoder(
self.object_store.clone(),
meta.location().to_string(),
self.compression_type,
|| Ok(self.config.builder().build_decoder()),
)
}
}
#[async_trait]
impl FileFormat for CsvFormat {
async fn infer_schema(&self, store: &ObjectStore, path: &str) -> Result<Schema> {

View File

@@ -15,29 +15,24 @@
use std::collections::HashMap;
use std::io::BufReader;
use std::str::FromStr;
use std::sync::Arc;
use arrow::datatypes::SchemaRef;
use arrow::json;
use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter};
use arrow::json::writer::LineDelimited;
use arrow::json::{self, ReaderBuilder};
use arrow::record_batch::RecordBatch;
use arrow_schema::Schema;
use async_trait::async_trait;
use common_runtime;
use datafusion::datasource::physical_plan::{FileMeta, FileOpenFuture, FileOpener};
use datafusion::error::{DataFusionError, Result as DataFusionResult};
use datafusion::physical_plan::SendableRecordBatchStream;
use object_store::ObjectStore;
use snafu::ResultExt;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tokio_util::io::SyncIoBridge;
use super::stream_to_file;
use crate::buffered_writer::DfRecordBatchEncoder;
use crate::compression::CompressionType;
use crate::error::{self, Result};
use crate::file_format::{self, open_with_decoder, FileFormat};
use crate::file_format::{self, stream_to_file, FileFormat};
use crate::share_buffer::SharedBuffer;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -114,47 +109,6 @@ impl FileFormat for JsonFormat {
}
}
#[derive(Debug, Clone)]
pub struct JsonOpener {
batch_size: usize,
projected_schema: SchemaRef,
object_store: Arc<ObjectStore>,
compression_type: CompressionType,
}
impl JsonOpener {
/// Return a new [`JsonOpener`]. Any fields not present in `projected_schema` will be ignored.
pub fn new(
batch_size: usize,
projected_schema: SchemaRef,
object_store: ObjectStore,
compression_type: CompressionType,
) -> Self {
Self {
batch_size,
projected_schema,
object_store: Arc::new(object_store),
compression_type,
}
}
}
impl FileOpener for JsonOpener {
fn open(&self, meta: FileMeta) -> DataFusionResult<FileOpenFuture> {
open_with_decoder(
self.object_store.clone(),
meta.location().to_string(),
self.compression_type,
|| {
ReaderBuilder::new(self.projected_schema.clone())
.with_batch_size(self.batch_size)
.build_decoder()
.map_err(DataFusionError::from)
},
)
}
}
pub async fn stream_to_json(
stream: SendableRecordBatchStream,
store: ObjectStore,

View File

@@ -19,7 +19,10 @@ use std::vec;
use common_test_util::find_workspace_path;
use datafusion::assert_batches_eq;
use datafusion::datasource::physical_plan::{FileOpener, FileScanConfig, FileStream, ParquetExec};
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
use datafusion::datasource::physical_plan::{
CsvConfig, CsvOpener, FileOpener, FileScanConfig, FileStream, JsonOpener, ParquetExec,
};
use datafusion::execution::context::TaskContext;
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
use datafusion::physical_plan::ExecutionPlan;
@@ -27,14 +30,11 @@ use datafusion::prelude::SessionContext;
use futures::StreamExt;
use super::FORMAT_TYPE;
use crate::compression::CompressionType;
use crate::error;
use crate::file_format::csv::{CsvConfigBuilder, CsvOpener};
use crate::file_format::json::JsonOpener;
use crate::file_format::orc::{OrcFormat, OrcOpener};
use crate::file_format::parquet::DefaultParquetFileReaderFactory;
use crate::file_format::{FileFormat, Format};
use crate::test_util::{self, scan_config, test_basic_schema, test_store};
use crate::test_util::{scan_config, test_basic_schema, test_store};
use crate::{error, test_util};
struct Test<'a, T: FileOpener> {
config: FileScanConfig,
@@ -62,15 +62,18 @@ impl<T: FileOpener> Test<'_, T> {
#[tokio::test]
async fn test_json_opener() {
let store = test_store("/");
let store = Arc::new(object_store_opendal::OpendalStore::new(store));
let schema = test_basic_schema();
let json_opener = JsonOpener::new(
100,
schema.clone(),
store.clone(),
CompressionType::Uncompressed,
);
let json_opener = || {
JsonOpener::new(
test_util::TEST_BATCH_SIZE,
schema.clone(),
FileCompressionType::UNCOMPRESSED,
store.clone(),
)
};
let path = &find_workspace_path("/src/common/datasource/tests/json/basic.json")
.display()
@@ -78,7 +81,7 @@ async fn test_json_opener() {
let tests = [
Test {
config: scan_config(schema.clone(), None, path),
opener: json_opener.clone(),
opener: json_opener(),
expected: vec![
"+-----+-------+",
"| num | str |",
@@ -91,7 +94,7 @@ async fn test_json_opener() {
},
Test {
config: scan_config(schema.clone(), Some(1), path),
opener: json_opener.clone(),
opener: json_opener(),
expected: vec![
"+-----+------+",
"| num | str |",
@@ -110,23 +113,30 @@ async fn test_json_opener() {
#[tokio::test]
async fn test_csv_opener() {
let store = test_store("/");
let store = Arc::new(object_store_opendal::OpendalStore::new(store));
let schema = test_basic_schema();
let path = &find_workspace_path("/src/common/datasource/tests/csv/basic.csv")
.display()
.to_string();
let csv_conf = CsvConfigBuilder::default()
.batch_size(test_util::TEST_BATCH_SIZE)
.file_schema(schema.clone())
.build()
.unwrap();
let csv_config = Arc::new(CsvConfig::new(
test_util::TEST_BATCH_SIZE,
schema.clone(),
None,
true,
b',',
b'"',
None,
store,
None,
));
let csv_opener = CsvOpener::new(csv_conf, store, CompressionType::Uncompressed);
let csv_opener = || CsvOpener::new(csv_config.clone(), FileCompressionType::UNCOMPRESSED);
let tests = [
Test {
config: scan_config(schema.clone(), None, path),
opener: csv_opener.clone(),
opener: csv_opener(),
expected: vec![
"+-----+-------+",
"| num | str |",
@@ -139,7 +149,7 @@ async fn test_csv_opener() {
},
Test {
config: scan_config(schema.clone(), Some(1), path),
opener: csv_opener.clone(),
opener: csv_opener(),
expected: vec![
"+-----+------+",
"| num | str |",

View File

@@ -16,17 +16,19 @@ use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use common_test_util::temp_dir::{create_temp_dir, TempDir};
use datafusion::common::Statistics;
use datafusion::common::{Constraints, Statistics};
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::object_store::ObjectStoreUrl;
use datafusion::datasource::physical_plan::{FileScanConfig, FileStream};
use datafusion::datasource::physical_plan::{
CsvConfig, CsvOpener, FileScanConfig, FileStream, JsonOpener,
};
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
use object_store::services::Fs;
use object_store::ObjectStore;
use crate::compression::CompressionType;
use crate::file_format::csv::{stream_to_csv, CsvConfigBuilder, CsvOpener};
use crate::file_format::json::{stream_to_json, JsonOpener};
use crate::file_format::csv::stream_to_csv;
use crate::file_format::json::stream_to_json;
use crate::test_util;
pub const TEST_BATCH_SIZE: usize = 100;
@@ -74,6 +76,7 @@ pub fn scan_config(file_schema: SchemaRef, limit: Option<usize>, filename: &str)
object_store_url: ObjectStoreUrl::parse("empty://").unwrap(), // won't be used
file_schema,
file_groups: vec![vec![PartitionedFile::new(filename.to_string(), 10)]],
constraints: Constraints::empty(),
statistics,
projection: None,
limit,
@@ -90,8 +93,8 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi
let json_opener = JsonOpener::new(
test_util::TEST_BATCH_SIZE,
schema.clone(),
store.clone(),
CompressionType::Uncompressed,
FileCompressionType::UNCOMPRESSED,
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
);
let size = store.read(origin_path).await.unwrap().len();
@@ -124,13 +127,19 @@ pub async fn setup_stream_to_csv_test(origin_path: &str, threshold: impl Fn(usiz
let schema = test_basic_schema();
let csv_conf = CsvConfigBuilder::default()
.batch_size(test_util::TEST_BATCH_SIZE)
.file_schema(schema.clone())
.build()
.unwrap();
let csv_config = Arc::new(CsvConfig::new(
TEST_BATCH_SIZE,
schema.clone(),
None,
true,
b',',
b'"',
None,
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
None,
));
let csv_opener = CsvOpener::new(csv_conf, store.clone(), CompressionType::Uncompressed);
let csv_opener = CsvOpener::new(csv_config, FileCompressionType::UNCOMPRESSED);
let size = store.read(origin_path).await.unwrap().len();

View File

@@ -12,3 +12,6 @@ http.workspace = true
snafu.workspace = true
strum.workspace = true
tonic.workspace = true
[dev-dependencies]
common-macro.workspace = true

View File

@@ -42,7 +42,7 @@ pub trait ErrorExt: StackError {
if let Some(external_error) = error.source() {
let external_root = external_error.sources().last().unwrap();
if error.to_string().is_empty() {
if error.transparent() {
format!("{external_root}")
} else {
format!("{error}: {external_root}")
@@ -86,6 +86,14 @@ pub trait StackError: std::error::Error {
}
result
}
/// Indicates whether this error is "transparent", meaning it delegates its "display" and "source"
/// to the underlying error. This can be useful when you are just wrapping some external error
/// **AND** cannot or would not provide meaningful contextual info, for example the
/// `DataFusionError`.
fn transparent(&self) -> bool {
false
}
}
impl<T: ?Sized + StackError> StackError for Arc<T> {

View File

@@ -34,12 +34,14 @@ pub enum StatusCode {
Internal = 1003,
/// Invalid arguments.
InvalidArguments = 1004,
/// The task is cancelled.
/// The task is cancelled (typically caller-side).
Cancelled = 1005,
/// Illegal state, can be exposed to users.
IllegalState = 1006,
/// Caused by some error originated from external system.
External = 1007,
/// The request's deadline is exceeded (typically server-side).
DeadlineExceeded = 1008,
// ====== End of common status code ================
// ====== Begin of SQL related status code =========
@@ -142,6 +144,7 @@ impl StatusCode {
| StatusCode::Unexpected
| StatusCode::InvalidArguments
| StatusCode::Cancelled
| StatusCode::DeadlineExceeded
| StatusCode::InvalidSyntax
| StatusCode::DatabaseAlreadyExists
| StatusCode::PlanQuery
@@ -177,6 +180,7 @@ impl StatusCode {
| StatusCode::Unexpected
| StatusCode::Internal
| StatusCode::Cancelled
| StatusCode::DeadlineExceeded
| StatusCode::IllegalState
| StatusCode::EngineExecuteQuery
| StatusCode::StorageUnavailable
@@ -272,6 +276,7 @@ pub fn status_to_tonic_code(status_code: StatusCode) -> Code {
Code::InvalidArgument
}
StatusCode::Cancelled => Code::Cancelled,
StatusCode::DeadlineExceeded => Code::DeadlineExceeded,
StatusCode::TableAlreadyExists
| StatusCode::TableColumnExists
| StatusCode::RegionAlreadyExists
@@ -299,6 +304,15 @@ pub fn status_to_tonic_code(status_code: StatusCode) -> Code {
}
}
/// Converts tonic [Code] to [StatusCode].
pub fn convert_tonic_code_to_status_code(code: Code) -> StatusCode {
match code {
Code::Cancelled => StatusCode::Cancelled,
Code::DeadlineExceeded => StatusCode::DeadlineExceeded,
_ => StatusCode::Internal,
}
}
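As a quick sanity check of the new two-way mapping, a round trip might look like the sketch below (the `common_error::status_code` import path is an assumption based on the surrounding hunks):

use common_error::status_code::{convert_tonic_code_to_status_code, status_to_tonic_code, StatusCode};
use tonic::Code;

fn main() {
    // DeadlineExceeded now has a dedicated mapping in both directions.
    assert_eq!(status_to_tonic_code(StatusCode::DeadlineExceeded), Code::DeadlineExceeded);
    assert_eq!(convert_tonic_code_to_status_code(Code::DeadlineExceeded), StatusCode::DeadlineExceeded);
    // Codes without a dedicated mapping collapse to Internal.
    assert_eq!(convert_tonic_code_to_status_code(Code::NotFound), StatusCode::Internal);
}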
#[cfg(test)]
mod tests {
use strum::IntoEnumIterator;

View File

@@ -0,0 +1,115 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_error::ext::{ErrorExt, PlainError, StackError};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, ResultExt, Snafu};
#[derive(Snafu)]
#[stack_trace_debug]
enum MyError {
#[snafu(display(r#"A normal error with "display" attribute, message "{}""#, message))]
Normal {
message: String,
#[snafu(source)]
error: PlainError,
#[snafu(implicit)]
location: Location,
},
#[snafu(transparent)]
Transparent {
#[snafu(source)]
error: PlainError,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for MyError {
fn status_code(&self) -> StatusCode {
StatusCode::Unexpected
}
fn as_any(&self) -> &dyn Any {
self
}
}
fn normal_error() -> Result<(), MyError> {
let plain_error = PlainError::new("<root cause>".to_string(), StatusCode::Unexpected);
Err(plain_error).context(NormalSnafu { message: "blabla" })
}
fn transparent_error() -> Result<(), MyError> {
let plain_error = PlainError::new("<root cause>".to_string(), StatusCode::Unexpected);
Err(plain_error)?
}
#[test]
fn test_output_msg() {
let result = normal_error();
assert_eq!(
result.unwrap_err().output_msg(),
r#"A normal error with "display" attribute, message "blabla": <root cause>"#
);
let result = transparent_error();
assert_eq!(result.unwrap_err().output_msg(), "<root cause>");
}
#[test]
fn test_to_string() {
let result = normal_error();
assert_eq!(
result.unwrap_err().to_string(),
r#"A normal error with "display" attribute, message "blabla""#
);
let result = transparent_error();
assert_eq!(result.unwrap_err().to_string(), "<root cause>");
}
#[test]
fn test_debug_format() {
let result = normal_error();
let debug_output = format!("{:?}", result.unwrap_err());
let normalized_output = debug_output.replace('\\', "/");
assert_eq!(
normalized_output,
r#"0: A normal error with "display" attribute, message "blabla", at src/common/error/tests/ext.rs:55:22
1: PlainError { msg: "<root cause>", status_code: Unexpected }"#
);
let result = transparent_error();
let debug_output = format!("{:?}", result.unwrap_err());
let normalized_output = debug_output.replace('\\', "/");
assert_eq!(
normalized_output,
r#"0: <transparent>, at src/common/error/tests/ext.rs:60:5
1: PlainError { msg: "<root cause>", status_code: Unexpected }"#
);
}
#[test]
fn test_transparent_flag() {
let result = normal_error();
assert!(!result.unwrap_err().transparent());
let result = transparent_error();
assert!(result.unwrap_err().transparent());
}

View File

@@ -8,6 +8,7 @@ license.workspace = true
workspace = true
[features]
testing = []
default = ["geo"]
geo = ["geohash", "h3o", "s2", "wkt", "geo-types", "dep:geo"]
@@ -17,6 +18,7 @@ api.workspace = true
arc-swap = "1.0"
async-trait.workspace = true
bincode = "1.3"
catalog.workspace = true
chrono.workspace = true
common-base.workspace = true
common-catalog.workspace = true

View File

@@ -15,6 +15,7 @@
mod add_region_follower;
mod flush_compact_region;
mod flush_compact_table;
mod metadata_snaphost;
mod migrate_region;
mod remove_region_follower;
@@ -23,6 +24,7 @@ use std::sync::Arc;
use add_region_follower::AddRegionFollowerFunction;
use flush_compact_region::{CompactRegionFunction, FlushRegionFunction};
use flush_compact_table::{CompactTableFunction, FlushTableFunction};
use metadata_snaphost::{DumpMetadataFunction, RestoreMetadataFunction};
use migrate_region::MigrateRegionFunction;
use remove_region_follower::RemoveRegionFollowerFunction;
@@ -43,5 +45,7 @@ impl AdminFunction {
registry.register_async(Arc::new(FlushTableFunction));
registry.register_async(Arc::new(CompactTableFunction));
registry.register_async(Arc::new(FlushFlowFunction));
registry.register_async(Arc::new(DumpMetadataFunction));
registry.register_async(Arc::new(RestoreMetadataFunction));
}
}

View File

@@ -0,0 +1,56 @@
use common_macro::admin_fn;
use common_query::error::{MissingMetadataSnapshotHandlerSnafu, Result};
use common_query::prelude::{Signature, Volatility};
use datatypes::prelude::*;
use session::context::QueryContextRef;
use crate::handlers::MetadataSnapshotHandlerRef;
const METADATA_DIR: &str = "/snaphost/";
const METADATA_FILE_NAME: &str = "dump_metadata";
const METADATA_FILE_EXTENSION: &str = "metadata.fb";
#[admin_fn(
name = DumpMetadataFunction,
display_name = dump_metadata,
sig_fn = dump_signature,
ret = string
)]
pub(crate) async fn dump_metadata(
metadata_snapshot_handler: &MetadataSnapshotHandlerRef,
_query_ctx: &QueryContextRef,
_params: &[ValueRef<'_>],
) -> Result<Value> {
let filename = metadata_snapshot_handler
.dump(METADATA_DIR, METADATA_FILE_NAME)
.await?;
Ok(Value::from(filename))
}
fn dump_signature() -> Signature {
Signature::uniform(0, vec![], Volatility::Immutable)
}
#[admin_fn(
name = RestoreMetadataFunction,
display_name = restore_metadata,
sig_fn = restore_signature,
ret = uint64,
)]
pub(crate) async fn restore_metadata(
metadata_snapshot_handler: &MetadataSnapshotHandlerRef,
_query_ctx: &QueryContextRef,
_params: &[ValueRef<'_>],
) -> Result<Value> {
let num_keyvalues = metadata_snapshot_handler
.restore(
METADATA_DIR,
&format!("{METADATA_FILE_NAME}.{METADATA_FILE_EXTENSION}"),
)
.await?;
Ok(Value::from(num_keyvalues))
}
fn restore_signature() -> Signature {
Signature::uniform(0, vec![], Volatility::Immutable)
}

View File

@@ -25,12 +25,13 @@ use session::context::QueryContextRef;
use crate::handlers::ProcedureServiceHandlerRef;
use crate::helper::cast_u64;
const DEFAULT_TIMEOUT_SECS: u64 = 30;
/// The default timeout for the migrate region procedure.
const DEFAULT_TIMEOUT_SECS: u64 = 300;
/// A function to migrate a region from a source peer to a target peer.
/// Returns the submitted procedure id if successful. Only available in cluster mode.
///
/// - `migrate_region(region_id, from_peer, to_peer)`, with timeout(30 seconds).
/// - `migrate_region(region_id, from_peer, to_peer)`, with timeout(300 seconds).
/// - `migrate_region(region_id, from_peer, to_peer, timeout(secs))`.
///
/// The parameters:

View File

@@ -32,7 +32,7 @@ pub struct FunctionContext {
impl FunctionContext {
/// Create a mock [`FunctionContext`] for test.
#[cfg(test)]
#[cfg(any(test, feature = "testing"))]
pub fn mock() -> Self {
Self {
query_ctx: QueryContextBuilder::default().build().into(),

View File

@@ -15,6 +15,7 @@
use std::sync::Arc;
use async_trait::async_trait;
use catalog::CatalogManagerRef;
use common_base::AffectedRows;
use common_meta::rpc::procedure::{
AddRegionFollowerRequest, MigrateRegionRequest, ProcedureStateResponse,
@@ -72,6 +73,9 @@ pub trait ProcedureServiceHandler: Send + Sync {
/// Remove a region follower from a region.
async fn remove_region_follower(&self, request: RemoveRegionFollowerRequest) -> Result<()>;
/// Get the catalog manager
fn catalog_manager(&self) -> &CatalogManagerRef;
}
/// This flow service handler is only used for flushing flows for now.
@@ -85,8 +89,18 @@ pub trait FlowServiceHandler: Send + Sync {
) -> Result<api::v1::flow::FlowResponse>;
}
/// This metadata snapshot handler is only used for dumping and restoring metadata for now.
#[async_trait]
pub trait MetadataSnapshotHandler: Send + Sync {
async fn dump(&self, path: &str, filename: &str) -> Result<String>;
async fn restore(&self, path: &str, filename: &str) -> Result<u64>;
}
pub type TableMutationHandlerRef = Arc<dyn TableMutationHandler>;
pub type ProcedureServiceHandlerRef = Arc<dyn ProcedureServiceHandler>;
pub type FlowServiceHandlerRef = Arc<dyn FlowServiceHandler>;
pub type MetadataSnapshotHandlerRef = Arc<dyn MetadataSnapshotHandler>;

View File

@@ -27,7 +27,7 @@ use datatypes::value::{ListValue, Value};
use datatypes::vectors::VectorRef;
use snafu::{ensure, ResultExt};
use super::helpers::{ensure_columns_len, ensure_columns_n};
use crate::scalars::geo::helpers::{ensure_columns_len, ensure_columns_n};
/// Accumulator of lat, lng, timestamp tuples
#[derive(Debug)]

View File

@@ -31,8 +31,8 @@ use h3o::{CellIndex, LatLng, Resolution};
use once_cell::sync::Lazy;
use snafu::ResultExt;
use super::helpers::{ensure_and_coerce, ensure_columns_len, ensure_columns_n};
use crate::function::{Function, FunctionContext};
use crate::scalars::geo::helpers::{ensure_and_coerce, ensure_columns_len, ensure_columns_n};
static CELL_TYPES: Lazy<Vec<ConcreteDataType>> = Lazy::new(|| {
vec![

View File

@@ -26,9 +26,9 @@ use geo::{Area, Distance, Haversine};
use geo_types::Geometry;
use snafu::ResultExt;
use super::helpers::{ensure_columns_len, ensure_columns_n};
use super::wkt::parse_wkt;
use crate::function::{Function, FunctionContext};
use crate::scalars::geo::helpers::{ensure_columns_len, ensure_columns_n};
use crate::scalars::geo::wkt::parse_wkt;
/// Return the WGS84 (SRID: 4326) euclidean distance between two geometry objects, in degrees
#[derive(Clone, Debug, Default, Display)]

View File

@@ -23,9 +23,9 @@ use geo::algorithm::contains::Contains;
use geo::algorithm::intersects::Intersects;
use geo::algorithm::within::Within;
use super::helpers::{ensure_columns_len, ensure_columns_n};
use super::wkt::parse_wkt;
use crate::function::{Function, FunctionContext};
use crate::scalars::geo::helpers::{ensure_columns_len, ensure_columns_n};
use crate::scalars::geo::wkt::parse_wkt;
/// Test if spatial relationship: contains
#[derive(Clone, Debug, Default, Display)]

View File

@@ -26,8 +26,8 @@ use once_cell::sync::Lazy;
use snafu::ResultExt;
use wkt::{ToWkt, TryFromWkt};
use super::helpers::{ensure_columns_len, ensure_columns_n};
use crate::function::{Function, FunctionContext};
use crate::scalars::geo::helpers::{ensure_columns_len, ensure_columns_n};
static COORDINATE_TYPES: Lazy<Vec<ConcreteDataType>> = Lazy::new(|| {
vec![

View File

@@ -163,7 +163,7 @@ mod tests {
];
let args = ScalarFunctionArgs {
args: &args,
args,
number_rows: 4,
return_type: &ConcreteDataType::boolean_datatype().as_arrow_type(),
};

View File

@@ -12,7 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::handlers::{FlowServiceHandlerRef, ProcedureServiceHandlerRef, TableMutationHandlerRef};
use crate::handlers::{
FlowServiceHandlerRef, MetadataSnapshotHandlerRef, ProcedureServiceHandlerRef,
TableMutationHandlerRef,
};
/// Shared state for SQL functions.
/// The handlers in state may be `None` in cli command-line or test cases.
@@ -24,16 +27,19 @@ pub struct FunctionState {
pub procedure_service_handler: Option<ProcedureServiceHandlerRef>,
// The flownode handler
pub flow_service_handler: Option<FlowServiceHandlerRef>,
// The metadata snapshot handler
pub metadata_snapshot_handler: Option<MetadataSnapshotHandlerRef>,
}
impl FunctionState {
/// Create a mock [`FunctionState`] for test.
#[cfg(test)]
#[cfg(any(test, feature = "testing"))]
pub fn mock() -> Self {
use std::sync::Arc;
use api::v1::meta::ProcedureStatus;
use async_trait::async_trait;
use catalog::CatalogManagerRef;
use common_base::AffectedRows;
use common_meta::rpc::procedure::{
AddRegionFollowerRequest, MigrateRegionRequest, ProcedureStateResponse,
@@ -47,10 +53,14 @@ impl FunctionState {
CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest,
};
use crate::handlers::{FlowServiceHandler, ProcedureServiceHandler, TableMutationHandler};
use crate::handlers::{
FlowServiceHandler, MetadataSnapshotHandler, ProcedureServiceHandler,
TableMutationHandler,
};
struct MockProcedureServiceHandler;
struct MockTableMutationHandler;
struct MockFlowServiceHandler;
struct MockMetadataServiceHandler;
const ROWS: usize = 42;
#[async_trait]
@@ -80,6 +90,10 @@ impl FunctionState {
) -> Result<()> {
Ok(())
}
fn catalog_manager(&self) -> &CatalogManagerRef {
unimplemented!()
}
}
#[async_trait]
@@ -145,10 +159,22 @@ impl FunctionState {
}
}
#[async_trait]
impl MetadataSnapshotHandler for MockMetadataServiceHandler {
async fn dump(&self, _path: &str, _filename: &str) -> Result<String> {
Ok("test_filename".to_string())
}
async fn restore(&self, _path: &str, _filename: &str) -> Result<u64> {
Ok(100)
}
}
Self {
table_mutation_handler: Some(Arc::new(MockTableMutationHandler)),
procedure_service_handler: Some(Arc::new(MockProcedureServiceHandler)),
flow_service_handler: Some(Arc::new(MockFlowServiceHandler)),
metadata_snapshot_handler: Some(Arc::new(MockMetadataServiceHandler)),
}
}
}

View File

@@ -23,8 +23,11 @@ flatbuffers = "24"
hyper.workspace = true
lazy_static.workspace = true
prost.workspace = true
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tonic.workspace = true
tower.workspace = true

View File

@@ -22,6 +22,7 @@ use dashmap::mapref::entry::Entry;
use dashmap::DashMap;
use lazy_static::lazy_static;
use snafu::{OptionExt, ResultExt};
use tokio_util::sync::CancellationToken;
use tonic::transport::{
Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri,
};
@@ -39,18 +40,48 @@ lazy_static! {
static ref ID: AtomicU64 = AtomicU64::new(0);
}
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct ChannelManager {
inner: Arc<Inner>,
}
#[derive(Debug)]
struct Inner {
id: u64,
config: ChannelConfig,
client_tls_config: Option<ClientTlsConfig>,
pool: Arc<Pool>,
channel_recycle_started: Arc<AtomicBool>,
channel_recycle_started: AtomicBool,
cancel: CancellationToken,
}
impl Default for ChannelManager {
impl Default for Inner {
fn default() -> Self {
ChannelManager::with_config(ChannelConfig::default())
Self::with_config(ChannelConfig::default())
}
}
impl Drop for Inner {
fn drop(&mut self) {
// Cancel the channel recycle task.
self.cancel.cancel();
}
}
impl Inner {
fn with_config(config: ChannelConfig) -> Self {
let id = ID.fetch_add(1, Ordering::Relaxed);
let pool = Arc::new(Pool::default());
let cancel = CancellationToken::new();
Self {
id,
config,
client_tls_config: None,
pool,
channel_recycle_started: AtomicBool::new(false),
cancel,
}
}
}
@@ -60,19 +91,14 @@ impl ChannelManager {
}
pub fn with_config(config: ChannelConfig) -> Self {
let id = ID.fetch_add(1, Ordering::Relaxed);
let pool = Arc::new(Pool::default());
let inner = Inner::with_config(config);
Self {
id,
config,
client_tls_config: None,
pool,
channel_recycle_started: Arc::new(AtomicBool::new(false)),
inner: Arc::new(inner),
}
}
pub fn with_tls_config(config: ChannelConfig) -> Result<Self> {
let mut cm = Self::with_config(config.clone());
let mut inner = Inner::with_config(config.clone());
// setup tls
let path_config = config.client_tls.context(InvalidTlsConfigSnafu {
@@ -88,17 +114,23 @@ impl ChannelManager {
.context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
cm.client_tls_config = Some(
inner.client_tls_config = Some(
ClientTlsConfig::new()
.ca_certificate(server_root_ca_cert)
.identity(client_identity),
);
Ok(cm)
Ok(Self {
inner: Arc::new(inner),
})
}
pub fn config(&self) -> &ChannelConfig {
&self.config
&self.inner.config
}
fn pool(&self) -> &Arc<Pool> {
&self.inner.pool
}
pub fn get(&self, addr: impl AsRef<str>) -> Result<InnerChannel> {
@@ -106,12 +138,12 @@ impl ChannelManager {
let addr = addr.as_ref();
// It will acquire the read lock.
if let Some(inner_ch) = self.pool.get(addr) {
if let Some(inner_ch) = self.pool().get(addr) {
return Ok(inner_ch);
}
// It will acquire the write lock.
let entry = match self.pool.entry(addr.to_string()) {
let entry = match self.pool().entry(addr.to_string()) {
Entry::Occupied(entry) => {
entry.get().increase_access();
entry.into_ref()
@@ -150,7 +182,7 @@ impl ChannelManager {
access: AtomicUsize::new(1),
use_default_connector: false,
};
self.pool.put(addr, channel);
self.pool().put(addr, channel);
Ok(inner_channel)
}
@@ -159,11 +191,11 @@ impl ChannelManager {
where
F: FnMut(&String, &mut Channel) -> bool,
{
self.pool.retain_channel(f);
self.pool().retain_channel(f);
}
fn build_endpoint(&self, addr: &str) -> Result<Endpoint> {
let http_prefix = if self.client_tls_config.is_some() {
let http_prefix = if self.inner.client_tls_config.is_some() {
"https"
} else {
"http"
@@ -172,51 +204,52 @@ impl ChannelManager {
let mut endpoint =
Endpoint::new(format!("{http_prefix}://{addr}")).context(CreateChannelSnafu)?;
if let Some(dur) = self.config.timeout {
if let Some(dur) = self.config().timeout {
endpoint = endpoint.timeout(dur);
}
if let Some(dur) = self.config.connect_timeout {
if let Some(dur) = self.config().connect_timeout {
endpoint = endpoint.connect_timeout(dur);
}
if let Some(limit) = self.config.concurrency_limit {
if let Some(limit) = self.config().concurrency_limit {
endpoint = endpoint.concurrency_limit(limit);
}
if let Some((limit, dur)) = self.config.rate_limit {
if let Some((limit, dur)) = self.config().rate_limit {
endpoint = endpoint.rate_limit(limit, dur);
}
if let Some(size) = self.config.initial_stream_window_size {
if let Some(size) = self.config().initial_stream_window_size {
endpoint = endpoint.initial_stream_window_size(size);
}
if let Some(size) = self.config.initial_connection_window_size {
if let Some(size) = self.config().initial_connection_window_size {
endpoint = endpoint.initial_connection_window_size(size);
}
if let Some(dur) = self.config.http2_keep_alive_interval {
if let Some(dur) = self.config().http2_keep_alive_interval {
endpoint = endpoint.http2_keep_alive_interval(dur);
}
if let Some(dur) = self.config.http2_keep_alive_timeout {
if let Some(dur) = self.config().http2_keep_alive_timeout {
endpoint = endpoint.keep_alive_timeout(dur);
}
if let Some(enabled) = self.config.http2_keep_alive_while_idle {
if let Some(enabled) = self.config().http2_keep_alive_while_idle {
endpoint = endpoint.keep_alive_while_idle(enabled);
}
if let Some(enabled) = self.config.http2_adaptive_window {
if let Some(enabled) = self.config().http2_adaptive_window {
endpoint = endpoint.http2_adaptive_window(enabled);
}
if let Some(tls_config) = &self.client_tls_config {
if let Some(tls_config) = &self.inner.client_tls_config {
endpoint = endpoint
.tls_config(tls_config.clone())
.context(CreateChannelSnafu)?;
}
endpoint = endpoint
.tcp_keepalive(self.config.tcp_keepalive)
.tcp_nodelay(self.config.tcp_nodelay);
.tcp_keepalive(self.config().tcp_keepalive)
.tcp_nodelay(self.config().tcp_nodelay);
Ok(endpoint)
}
fn trigger_channel_recycling(&self) {
if self
.inner
.channel_recycle_started
.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
.is_err()
@@ -224,13 +257,15 @@ impl ChannelManager {
return;
}
let pool = self.pool.clone();
let _handle = common_runtime::spawn_global(async {
recycle_channel_in_loop(pool, RECYCLE_CHANNEL_INTERVAL_SECS).await;
let pool = self.pool().clone();
let cancel = self.inner.cancel.clone();
let id = self.inner.id;
let _handle = common_runtime::spawn_global(async move {
recycle_channel_in_loop(pool, id, cancel, RECYCLE_CHANNEL_INTERVAL_SECS).await;
});
info!(
"ChannelManager: {}, channel recycle is started, running in the background!",
self.id
self.inner.id
);
}
}
@@ -443,11 +478,23 @@ impl Pool {
}
}
async fn recycle_channel_in_loop(pool: Arc<Pool>, interval_secs: u64) {
async fn recycle_channel_in_loop(
pool: Arc<Pool>,
id: u64,
cancel: CancellationToken,
interval_secs: u64,
) {
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
loop {
let _ = interval.tick().await;
tokio::select! {
_ = cancel.cancelled() => {
info!("Stop channel recycle, ChannelManager id: {}", id);
break;
},
_ = interval.tick() => {}
}
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
}
}
@@ -461,11 +508,7 @@ mod tests {
#[should_panic]
#[test]
fn test_invalid_addr() {
let pool = Arc::new(Pool::default());
let mgr = ChannelManager {
pool,
..Default::default()
};
let mgr = ChannelManager::default();
let addr = "http://test";
let _ = mgr.get(addr).unwrap();
@@ -475,7 +518,9 @@ mod tests {
async fn test_access_count() {
let mgr = ChannelManager::new();
// Do not start recycle
mgr.channel_recycle_started.store(true, Ordering::Relaxed);
mgr.inner
.channel_recycle_started
.store(true, Ordering::Relaxed);
let mgr = Arc::new(mgr);
let addr = "test_uri";
@@ -493,12 +538,12 @@ mod tests {
join.await.unwrap();
}
assert_eq!(1000, mgr.pool.get_access(addr).unwrap());
assert_eq!(1000, mgr.pool().get_access(addr).unwrap());
mgr.pool
mgr.pool()
.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0);
assert_eq!(0, mgr.pool.get_access(addr).unwrap());
assert_eq!(0, mgr.pool().get_access(addr).unwrap());
}
#[test]
@@ -624,4 +669,49 @@ mod tests {
true
});
}
#[tokio::test]
async fn test_pool_release_with_channel_recycle() {
let mgr = ChannelManager::new();
let pool_holder = mgr.pool().clone();
// start channel recycle task
let addr = "test_addr";
let _ = mgr.get(addr);
let mgr_clone_1 = mgr.clone();
let mgr_clone_2 = mgr.clone();
assert_eq!(3, Arc::strong_count(mgr.pool()));
drop(mgr_clone_1);
drop(mgr_clone_2);
assert_eq!(3, Arc::strong_count(mgr.pool()));
drop(mgr);
// wait for the channel recycle task to finish
tokio::time::sleep(Duration::from_millis(10)).await;
assert_eq!(1, Arc::strong_count(&pool_holder));
}
#[tokio::test]
async fn test_pool_release_without_channel_recycle() {
let mgr = ChannelManager::new();
let pool_holder = mgr.pool().clone();
let mgr_clone_1 = mgr.clone();
let mgr_clone_2 = mgr.clone();
assert_eq!(2, Arc::strong_count(mgr.pool()));
drop(mgr_clone_1);
drop(mgr_clone_2);
assert_eq!(2, Arc::strong_count(mgr.pool()));
drop(mgr);
assert_eq!(1, Arc::strong_count(&pool_holder));
}
}

View File

@@ -97,6 +97,14 @@ pub enum Error {
#[snafu(display("Not supported: {}", feat))]
NotSupported { feat: String },
#[snafu(display("Failed to serde Json"))]
SerdeJson {
#[snafu(source)]
error: serde_json::error::Error,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
@@ -110,7 +118,8 @@ impl ErrorExt for Error {
Error::CreateChannel { .. }
| Error::Conversion { .. }
| Error::DecodeFlightData { .. } => StatusCode::Internal,
| Error::DecodeFlightData { .. }
| Error::SerdeJson { .. } => StatusCode::Internal,
Error::CreateRecordBatch { source, .. } => source.status_code(),
Error::ConvertArrowSchema { source, .. } => source.status_code(),

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod do_put;
use std::collections::HashMap;
use std::sync::Arc;

View File

@@ -0,0 +1,93 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow_flight::PutResult;
use common_base::AffectedRows;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use crate::error::{Error, SerdeJsonSnafu};
/// The metadata for "DoPut" requests and responses.
///
/// Currently, there's only a "request_id", used for coordinating requests and responses in the streams.
/// A client can set a unique request id in this metadata, and the server will return the same id in
/// the corresponding response. In doing so, a client knows what to do with its pending requests.
#[derive(Serialize, Deserialize)]
pub struct DoPutMetadata {
request_id: i64,
}
impl DoPutMetadata {
pub fn new(request_id: i64) -> Self {
Self { request_id }
}
pub fn request_id(&self) -> i64 {
self.request_id
}
}
/// The response in the "DoPut" returned stream.
#[derive(Serialize, Deserialize)]
pub struct DoPutResponse {
/// The same "request_id" in the request; see the [DoPutMetadata].
request_id: i64,
/// The number of successfully ingested rows.
affected_rows: AffectedRows,
}
impl DoPutResponse {
pub fn new(request_id: i64, affected_rows: AffectedRows) -> Self {
Self {
request_id,
affected_rows,
}
}
pub fn request_id(&self) -> i64 {
self.request_id
}
pub fn affected_rows(&self) -> AffectedRows {
self.affected_rows
}
}
impl TryFrom<PutResult> for DoPutResponse {
type Error = Error;
fn try_from(value: PutResult) -> Result<Self, Self::Error> {
serde_json::from_slice(&value.app_metadata).context(SerdeJsonSnafu)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_serde_do_put_metadata() {
let serialized = r#"{"request_id":42}"#;
let metadata = serde_json::from_str::<DoPutMetadata>(serialized).unwrap();
assert_eq!(metadata.request_id(), 42);
}
#[test]
fn test_serde_do_put_response() {
let x = DoPutResponse::new(42, 88);
let serialized = serde_json::to_string(&x).unwrap();
assert_eq!(serialized, r#"{"request_id":42,"affected_rows":88}"#);
}
}
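To make the request/response correlation concrete, a client-side sketch might look like this (the `common_grpc::flight::do_put` import path is an assumption; only `DoPutMetadata`, `DoPutResponse`, and their accessors come from the diff above):

use std::collections::HashMap;

use common_grpc::flight::do_put::{DoPutMetadata, DoPutResponse};

fn main() {
    // Two in-flight requests, keyed by the request id we attached to each
    // FlightData via DoPutMetadata serialized into app_metadata.
    let mut pending: HashMap<i64, &str> = HashMap::from([(1, "batch-a"), (2, "batch-b")]);
    let _app_metadata = serde_json::to_vec(&DoPutMetadata::new(1)).unwrap();

    // Suppose the server streams back these responses (normally obtained via
    // DoPutResponse::try_from(PutResult)).
    for resp in [DoPutResponse::new(1, 100), DoPutResponse::new(2, 42)] {
        if let Some(tag) = pending.remove(&resp.request_id()) {
            println!("{tag}: ingested {} rows", resp.affected_rows());
        }
    }
}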

View File

@@ -16,7 +16,7 @@ use proc_macro::TokenStream;
use quote::quote;
use syn::spanned::Spanned;
use syn::{
parse_macro_input, Attribute, Ident, ItemFn, Signature, Type, TypePath, TypeReference,
parse_macro_input, Attribute, Ident, ItemFn, Path, Signature, Type, TypePath, TypeReference,
Visibility,
};
@@ -44,6 +44,7 @@ pub(crate) fn process_admin_fn(args: TokenStream, input: TokenStream) -> TokenSt
let mut display_name: Option<Ident> = None;
let mut sig_fn: Option<Ident> = None;
let mut ret: Option<Ident> = None;
let mut user_path: Option<Path> = None;
let parser = syn::meta::parser(|meta| {
if meta.path.is_ident("name") {
@@ -58,6 +59,9 @@ pub(crate) fn process_admin_fn(args: TokenStream, input: TokenStream) -> TokenSt
} else if meta.path.is_ident("ret") {
ret = Some(meta.value()?.parse()?);
Ok(())
} else if meta.path.is_ident("user_path") {
user_path = Some(meta.value()?.parse()?);
Ok(())
} else {
Err(meta.error("unsupported property"))
}
@@ -66,6 +70,10 @@ pub(crate) fn process_admin_fn(args: TokenStream, input: TokenStream) -> TokenSt
// extract arg map
parse_macro_input!(args with parser);
if user_path.is_none() {
user_path = Some(syn::parse_str("crate").expect("failed to parse user path"));
}
// decompose the fn block
let compute_fn = parse_macro_input!(input as ItemFn);
let ItemFn {
@@ -104,6 +112,7 @@ pub(crate) fn process_admin_fn(args: TokenStream, input: TokenStream) -> TokenSt
ret.expect("ret required"),
handler_type,
display_name,
user_path.expect("user_path required"),
);
result.extend(struct_code);
}
@@ -148,6 +157,7 @@ fn build_struct(
ret: Ident,
handler_type: &Ident,
display_name_ident: Ident,
user_path: Path,
) -> TokenStream {
let display_name = display_name_ident.to_string();
let ret = Ident::new(&format!("{ret}_datatype"), ret.span());
@@ -169,6 +179,10 @@ fn build_struct(
Ident::new("flow_service_handler", handler_type.span()),
Ident::new("MissingFlowServiceHandlerSnafu", handler_type.span()),
),
"MetadataSnapshotHandlerRef" => (
Ident::new("metadata_snapshot_handler", handler_type.span()),
Ident::new("MissingMetadataSnapshotHandlerSnafu", handler_type.span()),
),
handler => ok!(error!(
handler_type.span(),
format!("Unknown handler type: {handler}")
@@ -188,7 +202,7 @@ fn build_struct(
#[async_trait::async_trait]
impl crate::function::AsyncFunction for #name {
impl #user_path::function::AsyncFunction for #name {
fn name(&self) -> &'static str {
#display_name
}
@@ -201,9 +215,9 @@ fn build_struct(
#sig_fn()
}
async fn eval(&self, func_ctx: crate::function::FunctionContext, columns: &[datatypes::vectors::VectorRef]) -> common_query::error::Result<datatypes::vectors::VectorRef> {
async fn eval(&self, func_ctx: #user_path::function::FunctionContext, columns: &[datatypes::vectors::VectorRef]) -> common_query::error::Result<datatypes::vectors::VectorRef> {
// Ensure under the `greptime` catalog for security
crate::ensure_greptime!(func_ctx);
#user_path::ensure_greptime!(func_ctx);
let columns_num = columns.len();
let rows_num = if columns.is_empty() {

View File

@@ -91,8 +91,8 @@ pub fn range_fn(args: TokenStream, input: TokenStream) -> TokenStream {
/// - `ret`: The return type of the generated SQL function; it will be transformed into the `ConcreteDataType::{ret}_datatype()` result.
/// - `display_name`: The display name of the generated SQL function.
/// - `sig_fn`: the function that returns the `Signature` of the generated `Function`.
///
/// Note that this macro should only be used in `common-function` crate for now
/// - `user_path`: Optional path to the trait and context (e.g., `crate`);
/// defaults to `crate` if not provided.
#[proc_macro_attribute]
pub fn admin_fn(args: TokenStream, input: TokenStream) -> TokenStream {
process_admin_fn(args, input)
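For illustration, a caller outside the macro's home crate might use the new `user_path` key roughly as below. This mirrors the `dump_metadata` function shown earlier; the `common_function::...` import paths and handler re-exports are assumptions, not verified paths.

use common_function::handlers::MetadataSnapshotHandlerRef; // assumed re-export
use common_macro::admin_fn;
// The Snafu variant is referenced by the macro expansion when the handler is missing.
use common_query::error::{MissingMetadataSnapshotHandlerSnafu, Result};
use common_query::prelude::{Signature, Volatility};
use datatypes::prelude::*;
use session::context::QueryContextRef;

#[admin_fn(
    name = ExampleDumpFunction,
    display_name = example_dump,
    sig_fn = example_signature,
    ret = string,
    user_path = common_function
)]
pub(crate) async fn example_dump(
    metadata_snapshot_handler: &MetadataSnapshotHandlerRef,
    _query_ctx: &QueryContextRef,
    _params: &[ValueRef<'_>],
) -> Result<Value> {
    // Delegates to the handler just like dump_metadata above; the path and
    // filename here are made up for the example.
    let filename = metadata_snapshot_handler.dump("/example/", "example").await?;
    Ok(Value::from(filename))
}

fn example_signature() -> Signature {
    Signature::uniform(0, vec![], Volatility::Immutable)
}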

View File

@@ -139,7 +139,7 @@ fn build_struct(
Self::name(),
Self::input_type(),
Self::return_type(),
Volatility::Immutable,
Volatility::Volatile,
Arc::new(Self::calc) as _,
)
}

View File

@@ -14,7 +14,7 @@
//! implement `::common_error::ext::StackError`
use proc_macro2::{Span, TokenStream as TokenStream2};
use proc_macro2::{Literal, Span, TokenStream as TokenStream2, TokenTree};
use quote::{quote, quote_spanned};
use syn::spanned::Spanned;
use syn::{parenthesized, Attribute, Ident, ItemEnum, Variant};
@@ -32,6 +32,7 @@ pub fn stack_trace_style_impl(args: TokenStream2, input: TokenStream2) -> TokenS
variants.push(variant);
}
let transparent_fn = build_transparent_fn(enum_name.clone(), &variants);
let debug_fmt_fn = build_debug_fmt_impl(enum_name.clone(), variants.clone());
let next_fn = build_next_impl(enum_name.clone(), variants);
let debug_impl = build_debug_impl(enum_name.clone());
@@ -43,6 +44,7 @@ pub fn stack_trace_style_impl(args: TokenStream2, input: TokenStream2) -> TokenS
impl ::common_error::ext::StackError for #enum_name {
#debug_fmt_fn
#next_fn
#transparent_fn
}
#debug_impl
@@ -115,6 +117,7 @@ struct ErrorVariant {
has_source: bool,
has_external_cause: bool,
display: TokenStream2,
transparent: bool,
span: Span,
cfg_attr: Option<Attribute>,
}
@@ -140,6 +143,7 @@ impl ErrorVariant {
}
let mut display = None;
let mut transparent = false;
let mut cfg_attr = None;
for attr in variant.attrs {
if attr.path().is_ident("snafu") {
@@ -150,17 +154,29 @@ impl ErrorVariant {
let display_ts: TokenStream2 = content.parse()?;
display = Some(display_ts);
Ok(())
} else if meta.path.is_ident("transparent") {
display = Some(TokenStream2::from(TokenTree::Literal(Literal::string(
"<transparent>",
))));
transparent = true;
Ok(())
} else {
Err(meta.error("unrecognized repr"))
}
})
.expect("Each error should contains a display attribute");
.unwrap_or_else(|e| panic!("{e}"));
}
if attr.path().is_ident("cfg") {
cfg_attr = Some(attr);
}
}
let display = display.unwrap_or_else(|| {
panic!(
r#"Error "{}" must be annotated with attribute "display" or "transparent"."#,
variant.ident,
)
});
let field_ident = variant
.fields
@@ -174,7 +190,8 @@ impl ErrorVariant {
has_location,
has_source,
has_external_cause,
display: display.unwrap(),
display,
transparent,
span,
cfg_attr,
}
@@ -275,4 +292,44 @@ impl ErrorVariant {
}
}
}
fn build_transparent_match_arm(&self) -> TokenStream2 {
let cfg = if let Some(cfg) = &self.cfg_attr {
quote_spanned!(cfg.span() => #cfg)
} else {
quote! {}
};
let name = &self.name;
let fields = &self.fields;
if self.transparent {
quote_spanned! {
self.span => #cfg #[allow(unused_variables)] #name { #(#fields),* } => {
true
},
}
} else {
quote_spanned! {
self.span => #cfg #[allow(unused_variables)] #name { #(#fields),* } => {
false
}
}
}
}
}
fn build_transparent_fn(enum_name: Ident, variants: &[ErrorVariant]) -> TokenStream2 {
let match_arms = variants
.iter()
.map(|v| v.build_transparent_match_arm())
.collect::<Vec<_>>();
quote! {
fn transparent(&self) -> bool {
use #enum_name::*;
match self {
#(#match_arms)*
}
}
}
}

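The generated `transparent()` method simply reports whether the current variant was marked `#[snafu(transparent)]`, i.e. whether it only forwards an inner error instead of carrying its own display message. A hand-written sketch of the behavior the macro expands to, using a toy enum rather than a real error type from the codebase:

#[allow(dead_code)]
#[derive(Debug)]
enum SketchError {
    // In the real macro this variant would carry #[snafu(display("invalid config: {name}"))].
    InvalidConfig { name: String },
    // In the real macro this variant would carry #[snafu(transparent)].
    External { source: std::io::Error },
}

impl SketchError {
    // Equivalent of the generated `transparent()`: true only for transparent variants.
    fn transparent(&self) -> bool {
        match self {
            SketchError::InvalidConfig { .. } => false,
            SketchError::External { .. } => true,
        }
    }
}

fn main() {
    let err = SketchError::External {
        source: std::io::Error::new(std::io::ErrorKind::Other, "disk full"),
    };
    assert!(err.transparent());

    let err = SketchError::InvalidConfig { name: "wal.dir".to_string() };
    assert!(!err.transparent());
    println!("{err:?}");
}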

@@ -41,6 +41,7 @@ deadpool = { workspace = true, optional = true }
deadpool-postgres = { workspace = true, optional = true }
derive_builder.workspace = true
etcd-client.workspace = true
flexbuffers = "25.2"
futures.workspace = true
futures-util.workspace = true
hex.workspace = true
@@ -48,6 +49,7 @@ humantime-serde.workspace = true
itertools.workspace = true
lazy_static.workspace = true
moka.workspace = true
object-store.workspace = true
prometheus.workspace = true
prost.workspace = true
rand.workspace = true
@@ -70,6 +72,7 @@ typetag.workspace = true
[dev-dependencies]
chrono.workspace = true
common-procedure = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
common-wal = { workspace = true, features = ["testing"] }
datatypes.workspace = true
hyper = { version = "0.14", features = ["full"] }


@@ -94,6 +94,13 @@ pub struct RegionStat {
pub index_size: u64,
/// The manifest info of the region.
pub region_manifest: RegionManifestInfo,
/// The latest entry id of topic used by data.
/// **Only used by remote WAL prune.**
pub data_topic_latest_entry_id: u64,
/// The latest entry id of topic used by metadata.
/// **Only used by remote WAL prune.**
/// In mito engine, this is the same as `data_topic_latest_entry_id`.
pub metadata_topic_latest_entry_id: u64,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
@@ -142,6 +149,43 @@ impl Stat {
self.wcus = self.region_stats.iter().map(|s| s.wcus).sum();
self.region_num = self.region_stats.len() as u64;
}
pub fn memory_size(&self) -> usize {
// timestamp_millis, rcus, wcus
std::mem::size_of::<i64>() * 3 +
// id, region_num, node_epoch
std::mem::size_of::<u64>() * 3 +
// addr
std::mem::size_of::<String>() + self.addr.capacity() +
// region_stats
self.region_stats.iter().map(|s| s.memory_size()).sum::<usize>()
}
}
impl RegionStat {
pub fn memory_size(&self) -> usize {
// role
std::mem::size_of::<RegionRole>() +
// id
std::mem::size_of::<RegionId>() +
// rcus, wcus, approximate_bytes, num_rows
std::mem::size_of::<i64>() * 4 +
// memtable_size, manifest_size, sst_size, index_size
std::mem::size_of::<u64>() * 4 +
// engine
std::mem::size_of::<String>() + self.engine.capacity() +
// region_manifest
self.region_manifest.memory_size()
}
}
impl RegionManifestInfo {
pub fn memory_size(&self) -> usize {
match self {
RegionManifestInfo::Mito { .. } => std::mem::size_of::<u64>() * 2,
RegionManifestInfo::Metric { .. } => std::mem::size_of::<u64>() * 4,
}
}
}
impl TryFrom<&HeartbeatRequest> for Stat {
@@ -227,6 +271,8 @@ impl From<&api::v1::meta::RegionStat> for RegionStat {
sst_size: region_stat.sst_size,
index_size: region_stat.index_size,
region_manifest: region_stat.manifest.into(),
data_topic_latest_entry_id: region_stat.data_topic_latest_entry_id,
metadata_topic_latest_entry_id: region_stat.metadata_topic_latest_entry_id,
}
}
}

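The new `memory_size` helpers estimate the in-memory footprint of a heartbeat `Stat`, presumably so stored stats can be weighed or bounded: fixed-width fields are counted with `size_of`, strings add their struct header plus heap capacity, and nested region stats are summed recursively. A self-contained sketch of the same accounting pattern on simplified types (not the real `Stat`/`RegionStat`):

use std::mem::size_of;

#[allow(dead_code)]
struct MiniRegionStat {
    engine: String,
    wcus: i64,
    sst_size: u64,
}

#[allow(dead_code)]
struct MiniStat {
    addr: String,
    node_epoch: u64,
    region_stats: Vec<MiniRegionStat>,
}

impl MiniRegionStat {
    fn memory_size(&self) -> usize {
        // wcus + sst_size by fixed size, plus the String header and its heap buffer.
        size_of::<i64>() + size_of::<u64>() + size_of::<String>() + self.engine.capacity()
    }
}

impl MiniStat {
    fn memory_size(&self) -> usize {
        // node_epoch
        size_of::<u64>()
            // addr
            + size_of::<String>() + self.addr.capacity()
            // nested region stats
            + self.region_stats.iter().map(|s| s.memory_size()).sum::<usize>()
    }
}

fn main() {
    let stat = MiniStat {
        addr: "127.0.0.1:4001".to_string(),
        node_epoch: 1,
        region_stats: vec![MiniRegionStat {
            engine: "mito".to_string(),
            wcus: 0,
            sst_size: 0,
        }],
    };
    println!("approximate in-memory size: {} bytes", stat.memory_size());
}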

@@ -20,8 +20,8 @@ use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use strum::AsRefStr;
use super::utils::handle_retry_error;
use crate::cache_invalidator::Context;
use crate::ddl::utils::handle_retry_error;
use crate::ddl::DdlContext;
use crate::error::{Result, SchemaNotFoundSnafu};
use crate::instruction::CacheIdent;


@@ -18,10 +18,12 @@ mod region_request;
mod table_cache_keys;
mod update_metadata;
use api::region::RegionResponse;
use async_trait::async_trait;
use common_catalog::format_full_table_name;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{Context, LockKey, Procedure, Status};
use common_telemetry::{info, warn};
use common_telemetry::{error, info, warn};
use futures_util::future;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
@@ -30,7 +32,7 @@ use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY;
use strum::AsRefStr;
use table::metadata::TableId;
use crate::ddl::utils::add_peer_context_if_needed;
use crate::ddl::utils::{add_peer_context_if_needed, sync_follower_regions};
use crate::ddl::DdlContext;
use crate::error::{DecodeJsonSnafu, Error, MetadataCorruptionSnafu, Result};
use crate::key::table_info::TableInfoValue;
@@ -39,7 +41,7 @@ use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::find_leaders;
use crate::rpc::router::{find_leaders, RegionRoute};
pub struct AlterLogicalTablesProcedure {
pub context: DdlContext,
@@ -125,14 +127,20 @@ impl AlterLogicalTablesProcedure {
});
}
// Collects responses from datanodes.
let phy_raw_schemas = future::join_all(alter_region_tasks)
let mut results = future::join_all(alter_region_tasks)
.await
.into_iter()
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.collect::<Result<Vec<_>>>()?;
// Collects responses from datanodes.
let phy_raw_schemas = results
.iter_mut()
.map(|res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY))
.collect::<Vec<_>>();
if phy_raw_schemas.is_empty() {
self.submit_sync_region_requests(results, &physical_table_route.region_routes)
.await;
self.data.state = AlterTablesState::UpdateMetadata;
return Ok(Status::executing(true));
}
@@ -155,10 +163,34 @@ impl AlterLogicalTablesProcedure {
warn!("altering logical table result doesn't contain extension key `{ALTER_PHYSICAL_EXTENSION_KEY}`, leaving the physical table's schema unchanged");
}
self.submit_sync_region_requests(results, &physical_table_route.region_routes)
.await;
self.data.state = AlterTablesState::UpdateMetadata;
Ok(Status::executing(true))
}
async fn submit_sync_region_requests(
&self,
results: Vec<RegionResponse>,
region_routes: &[RegionRoute],
) {
let table_info = &self.data.physical_table_info.as_ref().unwrap().table_info;
if let Err(err) = sync_follower_regions(
&self.context,
self.data.physical_table_id,
results,
region_routes,
table_info.meta.engine.as_str(),
)
.await
{
error!(err; "Failed to sync regions for table {}, table_id: {}",
format_full_table_name(&table_info.catalog_name, &table_info.schema_name, &table_info.name),
self.data.physical_table_id
);
}
}
pub(crate) async fn on_update_metadata(&mut self) -> Result<Status> {
self.update_physical_table_metadata().await?;
self.update_logical_tables_metadata().await?;


@@ -19,35 +19,39 @@ mod update_metadata;
use std::vec;
use api::region::RegionResponse;
use api::v1::alter_table_expr::Kind;
use api::v1::RenameTable;
use async_trait::async_trait;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_error::ext::BoxedError;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{
Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, Status, StringKey,
Context as ProcedureContext, ContextProvider, Error as ProcedureError, LockKey, PoisonKey,
PoisonKeys, Procedure, ProcedureId, Status, StringKey,
};
use common_telemetry::{debug, error, info};
use futures::future;
use futures::future::{self};
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use snafu::{ensure, ResultExt};
use store_api::storage::RegionId;
use strum::AsRefStr;
use table::metadata::{RawTableInfo, TableId, TableInfo};
use table::table_reference::TableReference;
use crate::cache_invalidator::Context;
use crate::ddl::utils::add_peer_context_if_needed;
use crate::ddl::utils::{
add_peer_context_if_needed, handle_multiple_results, sync_follower_regions, MultipleResults,
};
use crate::ddl::DdlContext;
use crate::error::{Error, Result};
use crate::error::{AbortProcedureSnafu, Error, NoLeaderSnafu, PutPoisonSnafu, Result};
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::{DeserializedValueWithBytes, RegionDistribution};
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::metrics;
use crate::poison_key::table_poison_key;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders, region_distribution};
use crate::rpc::router::{find_leader_regions, find_leaders, region_distribution, RegionRoute};
/// The alter table procedure
pub struct AlterTableProcedure {
@@ -104,7 +108,27 @@ impl AlterTableProcedure {
Ok(Status::executing(true))
}
pub async fn submit_alter_region_requests(&mut self) -> Result<Status> {
fn table_poison_key(&self) -> PoisonKey {
table_poison_key(self.data.table_id())
}
async fn put_poison(
&self,
ctx_provider: &dyn ContextProvider,
procedure_id: ProcedureId,
) -> Result<()> {
let poison_key = self.table_poison_key();
ctx_provider
.try_put_poison(&poison_key, procedure_id)
.await
.context(PutPoisonSnafu)
}
pub async fn submit_alter_region_requests(
&mut self,
procedure_id: ProcedureId,
ctx_provider: &dyn ContextProvider,
) -> Result<Status> {
let table_id = self.data.table_id();
let (_, physical_table_route) = self
.context
@@ -127,6 +151,9 @@ impl AlterTableProcedure {
alter_kind,
);
ensure!(!leaders.is_empty(), NoLeaderSnafu { table_id });
// Puts the poison before submitting alter region requests to datanodes.
self.put_poison(ctx_provider, procedure_id).await?;
for datanode in leaders {
let requester = self.context.node_manager.datanode(&datanode).await;
let regions = find_leader_regions(&physical_table_route.region_routes, &datanode);
@@ -140,28 +167,73 @@ impl AlterTableProcedure {
let requester = requester.clone();
alter_region_tasks.push(async move {
if let Err(err) = requester.handle(request).await {
if err.status_code() != StatusCode::RequestOutdated {
// Treat request outdated as success.
// The engine will throw this code when the schema version not match.
// As this procedure has locked the table, the only reason for this error
// is procedure is succeeded before and is retrying.
return Err(add_peer_context_if_needed(datanode)(err));
}
}
Ok(())
requester
.handle(request)
.await
.map_err(add_peer_context_if_needed(datanode))
});
}
}
future::join_all(alter_region_tasks)
let results = future::join_all(alter_region_tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;
.collect::<Vec<_>>();
self.data.state = AlterTableState::UpdateMetadata;
match handle_multiple_results(results) {
MultipleResults::PartialRetryable(error) => {
// Just returns the error and waits for the next retry.
Err(error)
}
MultipleResults::PartialNonRetryable(error) => {
error!(error; "Partial non-retryable errors occurred during alter table, table {}, table_id: {}", self.data.table_ref(), self.data.table_id());
// No retry will be done.
Ok(Status::poisoned(
Some(self.table_poison_key()),
ProcedureError::external(error),
))
}
MultipleResults::AllRetryable(error) => {
// Just returns the error and waits for the next retry.
Err(error)
}
MultipleResults::Ok(results) => {
self.submit_sync_region_requests(results, &physical_table_route.region_routes)
.await;
self.data.state = AlterTableState::UpdateMetadata;
Ok(Status::executing_with_clean_poisons(true))
}
MultipleResults::AllNonRetryable(error) => {
error!(error; "All alter requests returned non-retryable errors for table {}, table_id: {}", self.data.table_ref(), self.data.table_id());
// It assumes the metadata on the datanode is unchanged.
// Case: The alter region request is sent but not applied. (e.g., InvalidArgument)
Ok(Status::executing(true))
let err = BoxedError::new(error);
Err(err).context(AbortProcedureSnafu {
clean_poisons: true,
})
}
}
}
async fn submit_sync_region_requests(
&mut self,
results: Vec<RegionResponse>,
region_routes: &[RegionRoute],
) {
// Safety: filled in `prepare` step.
let table_info = self.data.table_info().unwrap();
if let Err(err) = sync_follower_regions(
&self.context,
self.data.table_id(),
results,
region_routes,
table_info.meta.engine.as_str(),
)
.await
{
error!(err; "Failed to sync regions for table {}, table_id: {}", self.data.table_ref(), self.data.table_id());
}
}
/// Update table metadata.
@@ -250,10 +322,12 @@ impl Procedure for AlterTableProcedure {
Self::TYPE_NAME
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
async fn execute(&mut self, ctx: &ProcedureContext) -> ProcedureResult<Status> {
let error_handler = |e: Error| {
if e.is_retry_later() {
ProcedureError::retry_later(e)
} else if e.need_clean_poisons() {
ProcedureError::external_and_clean_poisons(e)
} else {
ProcedureError::external(e)
}
@@ -269,7 +343,10 @@ impl Procedure for AlterTableProcedure {
match state {
AlterTableState::Prepare => self.on_prepare().await,
AlterTableState::SubmitAlterRegionRequests => self.submit_alter_region_requests().await,
AlterTableState::SubmitAlterRegionRequests => {
self.submit_alter_region_requests(ctx.procedure_id, ctx.provider.as_ref())
.await
}
AlterTableState::UpdateMetadata => self.on_update_metadata().await,
AlterTableState::InvalidateTableCache => self.on_broadcast().await,
}
@@ -285,6 +362,10 @@ impl Procedure for AlterTableProcedure {
LockKey::new(key)
}
fn poison_keys(&self) -> PoisonKeys {
PoisonKeys::new(vec![self.table_poison_key()])
}
}
#[derive(Debug, Serialize, Deserialize, AsRefStr)]

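`handle_multiple_results` and `MultipleResults` (imported above from `ddl::utils`) bucket the per-datanode outcomes so the procedure can choose between retrying, poisoning the table, or aborting. A simplified, self-contained sketch of one plausible reading of the variant names, with `ToyError`/`classify` as stand-ins rather than the real API:

#[derive(Debug)]
struct ToyError {
    retryable: bool,
}

#[allow(dead_code)]
#[derive(Debug)]
enum Outcome<T> {
    Ok(Vec<T>),
    AllRetryable(ToyError),
    AllNonRetryable(ToyError),
    PartialRetryable(ToyError),
    PartialNonRetryable(ToyError),
}

fn classify<T>(results: Vec<Result<T, ToyError>>) -> Outcome<T> {
    let total = results.len();
    let mut oks = Vec::new();
    let mut retryable = Vec::new();
    let mut non_retryable = Vec::new();
    for result in results {
        match result {
            Ok(v) => oks.push(v),
            Err(e) if e.retryable => retryable.push(e),
            Err(e) => non_retryable.push(e),
        }
    }
    let failed = retryable.len() + non_retryable.len();
    if failed == 0 {
        Outcome::Ok(oks) // all datanodes succeeded: sync followers, clean poisons
    } else if failed == total && non_retryable.is_empty() {
        Outcome::AllRetryable(retryable.pop().unwrap()) // return Err and let the procedure retry
    } else if failed == total && retryable.is_empty() {
        Outcome::AllNonRetryable(non_retryable.pop().unwrap()) // abort; datanode metadata assumed unchanged
    } else if non_retryable.is_empty() {
        Outcome::PartialRetryable(retryable.pop().unwrap()) // return Err and let the procedure retry
    } else {
        Outcome::PartialNonRetryable(non_retryable.pop().unwrap()) // poison the table; no retry will be done
    }
}

fn main() {
    let results: Vec<Result<u32, ToyError>> = vec![Ok(1), Err(ToyError { retryable: false })];
    println!("{:?}", classify(results));
}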

@@ -35,9 +35,8 @@ use snafu::{ensure, ResultExt};
use strum::AsRefStr;
use table::metadata::TableId;
use super::utils::add_peer_context_if_needed;
use crate::cache_invalidator::Context;
use crate::ddl::utils::handle_retry_error;
use crate::ddl::utils::{add_peer_context_if_needed, handle_retry_error};
use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::instruction::{CacheIdent, CreateFlow};
@@ -308,8 +307,7 @@ impl Procedure for CreateFlowProcedure {
}
pub fn determine_flow_type(_flow_task: &CreateFlowTask) -> FlowType {
// TODO(discord9): determine flow type
FlowType::RecordingRule
FlowType::Batching
}
/// The state of [CreateFlowProcedure].
@@ -326,29 +324,30 @@ pub enum CreateFlowState {
}
/// The type of flow.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FlowType {
/// The flow is a recording rule task.
RecordingRule,
/// The flow is a batching task.
Batching,
/// The flow is a streaming task.
Streaming,
}
impl FlowType {
pub const RECORDING_RULE: &str = "recording_rule";
pub const BATCHING: &str = "batching";
pub const STREAMING: &str = "streaming";
pub const FLOW_TYPE_KEY: &str = "flow_type";
}
impl Default for FlowType {
fn default() -> Self {
Self::RecordingRule
Self::Batching
}
}
impl fmt::Display for FlowType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FlowType::RecordingRule => write!(f, "{}", FlowType::RECORDING_RULE),
FlowType::Batching => write!(f, "{}", FlowType::BATCHING),
FlowType::Streaming => write!(f, "{}", FlowType::STREAMING),
}
}
@@ -391,7 +390,8 @@ impl From<&CreateFlowData> for CreateRequest {
};
let flow_type = value.flow_type.unwrap_or_default().to_string();
req.flow_options.insert("flow_type".to_string(), flow_type);
req.flow_options
.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
req
}
}

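With `RecordingRule` renamed to `Batching`, the flow type still reaches the flownode as a plain string stored under `FlowType::FLOW_TYPE_KEY` in `flow_options`. A small standalone sketch of writing and reading that option; the read-back side is an assumption for illustration, not part of this diff:

use std::collections::HashMap;
use std::fmt;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
enum FlowType {
    #[default]
    Batching,
    Streaming,
}

impl FlowType {
    const FLOW_TYPE_KEY: &'static str = "flow_type";
    const BATCHING: &'static str = "batching";
    const STREAMING: &'static str = "streaming";
}

impl fmt::Display for FlowType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            FlowType::Batching => write!(f, "{}", FlowType::BATCHING),
            FlowType::Streaming => write!(f, "{}", FlowType::STREAMING),
        }
    }
}

fn main() {
    let mut flow_options: HashMap<String, String> = HashMap::new();
    // Producer side, as in `From<&CreateFlowData> for CreateRequest` above.
    flow_options.insert(
        FlowType::FLOW_TYPE_KEY.to_string(),
        FlowType::default().to_string(),
    );

    // Consumer side (assumed): fall back to the default `Batching` on a missing or unknown value.
    let flow_type = if flow_options.get(FlowType::FLOW_TYPE_KEY).map(String::as_str)
        == Some(FlowType::STREAMING)
    {
        FlowType::Streaming
    } else {
        FlowType::Batching
    };
    assert_eq!(flow_type, FlowType::Batching);
    println!("flow_type = {flow_type}");
}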

@@ -17,12 +17,14 @@ mod metadata;
mod region_request;
mod update_metadata;
use api::region::RegionResponse;
use api::v1::CreateTableExpr;
use async_trait::async_trait;
use common_catalog::consts::METRIC_ENGINE;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status};
use common_telemetry::{debug, warn};
use futures_util::future::join_all;
use common_telemetry::{debug, error, warn};
use futures::future;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use store_api::metadata::ColumnMetadata;
@@ -31,7 +33,7 @@ use store_api::storage::{RegionId, RegionNumber};
use strum::AsRefStr;
use table::metadata::{RawTableInfo, TableId};
use crate::ddl::utils::{add_peer_context_if_needed, handle_retry_error};
use crate::ddl::utils::{add_peer_context_if_needed, handle_retry_error, sync_follower_regions};
use crate::ddl::DdlContext;
use crate::error::{DecodeJsonSnafu, MetadataCorruptionSnafu, Result};
use crate::key::table_route::TableRouteValue;
@@ -156,14 +158,20 @@ impl CreateLogicalTablesProcedure {
});
}
// Collects response from datanodes.
let phy_raw_schemas = join_all(create_region_tasks)
let mut results = future::join_all(create_region_tasks)
.await
.into_iter()
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.collect::<Result<Vec<_>>>()?;
// Collects response from datanodes.
let phy_raw_schemas = results
.iter_mut()
.map(|res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY))
.collect::<Vec<_>>();
if phy_raw_schemas.is_empty() {
self.submit_sync_region_requests(results, region_routes)
.await;
self.data.state = CreateTablesState::CreateMetadata;
return Ok(Status::executing(false));
}
@@ -186,10 +194,30 @@ impl CreateLogicalTablesProcedure {
warn!("creating logical table result doesn't contain extension key `{ALTER_PHYSICAL_EXTENSION_KEY}`, leaving the physical table's schema unchanged");
}
self.submit_sync_region_requests(results, region_routes)
.await;
self.data.state = CreateTablesState::CreateMetadata;
Ok(Status::executing(true))
}
async fn submit_sync_region_requests(
&self,
results: Vec<RegionResponse>,
region_routes: &[RegionRoute],
) {
if let Err(err) = sync_follower_regions(
&self.context,
self.data.physical_table_id,
results,
region_routes,
METRIC_ENGINE,
)
.await
{
error!(err; "Failed to sync regions for physical table_id: {}", self.data.physical_table_id);
}
}
}
#[async_trait]


@@ -299,7 +299,9 @@ impl Procedure for CreateTableProcedure {
.creator
.register_opening_regions(&self.context, &x.region_routes)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
.context(ExternalSnafu {
clean_poisons: false,
})?;
}
Ok(())


@@ -130,7 +130,9 @@ impl Procedure for DropDatabaseProcedure {
self.state
.recover(&self.runtime_context)
.map_err(BoxedError::new)
.context(ExternalSnafu)
.context(ExternalSnafu {
clean_poisons: false,
})
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {


@@ -22,11 +22,10 @@ use snafu::OptionExt;
use table::metadata::{TableId, TableType};
use table::table_name::TableName;
use super::executor::DropDatabaseExecutor;
use super::metadata::DropDatabaseRemoveMetadata;
use super::DropTableTarget;
use crate::cache_invalidator::Context;
use crate::ddl::drop_database::{DropDatabaseContext, State};
use crate::ddl::drop_database::executor::DropDatabaseExecutor;
use crate::ddl::drop_database::metadata::DropDatabaseRemoveMetadata;
use crate::ddl::drop_database::{DropDatabaseContext, DropTableTarget, State};
use crate::ddl::DdlContext;
use crate::error::{Result, TableInfoNotFoundSnafu};
use crate::instruction::CacheIdent;


@@ -22,9 +22,8 @@ use snafu::OptionExt;
use table::metadata::TableId;
use table::table_name::TableName;
use super::cursor::DropDatabaseCursor;
use super::{DropDatabaseContext, DropTableTarget};
use crate::ddl::drop_database::State;
use crate::ddl::drop_database::cursor::DropDatabaseCursor;
use crate::ddl::drop_database::{DropDatabaseContext, DropTableTarget, State};
use crate::ddl::drop_table::executor::DropTableExecutor;
use crate::ddl::utils::extract_region_wal_options;
use crate::ddl::DdlContext;


@@ -17,8 +17,8 @@ use std::any::Any;
use common_procedure::Status;
use serde::{Deserialize, Serialize};
use super::end::DropDatabaseEnd;
use crate::cache_invalidator::Context;
use crate::ddl::drop_database::end::DropDatabaseEnd;
use crate::ddl::drop_database::{DropDatabaseContext, State};
use crate::ddl::DdlContext;
use crate::error::Result;

Some files were not shown because too many files have changed in this diff Show More