Compare commits

50 Commits

Author SHA1 Message Date
luofucong
ee67ce10c9 insert some rows and query them across DDL to test the integrity of data 2024-05-14 20:41:47 +08:00
WenyXu
2ba721cc82 fix: fix bug 2024-05-13 08:25:18 +00:00
WenyXu
de468ee595 fix: test program 2024-05-13 07:46:30 +00:00
WenyXu
bb9bdf74ec feat: export metric endpoint 2024-05-13 02:44:34 +00:00
WenyXu
be5574fdb3 fuzz alter 2024-04-19 14:14:34 +00:00
WenyXu
f9afc5dbbf feat: start_database auto retry 2024-04-19 02:40:15 +00:00
WenyXu
c7400a4182 chore: reduce sleep time 2024-04-18 13:59:20 +00:00
WenyXu
bf07dd275a test: reproduce bugs 2024-04-18 13:48:11 +00:00
WenyXu
7e1eed4b18 feat: adapt for cuckoo 2024-04-16 14:52:06 +00:00
Weny Xu
d12379106e feat(drop_table): support to rollback table metadata (#3692)
* feat: support to rollback table metadata

* refactor: store table route value instead of physical table route

* feat(drop_table): support to rollback table metadata

* test: add rollback tests for drop table

* fix: do not set region to readonly

* test: add sqlness tests

* feat: implement TombstoneManager

* test: add tests for TombstoneManager

* refactor: using TombstoneManager

* chore: remove unused code

* fix: fix typo

* refactor: using `on_restore_metadata`

* refactor: add `executor` to `DropTableProcedure`

* refactor: simplify the `TombstoneManager`

* refactor: refactor `Key`

* refactor: carry more info

* feat: add `destroy_table_metadata`

* refactor: remove redundant table_route_value

* feat: ensure the key is empty

* feat: introduce `table_metadata_keys`

* chore: carry more info

* chore: remove clone

* chore: apply suggestions from CR

* feat: delete metadata tombstone
2024-04-16 09:22:41 +00:00
Weny Xu
64941d848e fix(alter_table): ignore request outdated error (#3715) 2024-04-16 08:18:38 +00:00
Ruihang Xia
96a40e0300 feat: check partition rule (#3711)
* feat: check partition rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy and fmt

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* correct test comment

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-04-16 08:13:49 +00:00
Yingwen
d2e081c1f9 docs: update memtable config example (#3712) 2024-04-16 07:26:20 +00:00
tison
cdbdb04d93 refactor: remove redundant try_flush invocations (#3706)
* refactor: remove redundant try_flush invocations

Signed-off-by: tison <wander4096@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-04-16 06:35:55 +00:00
Lei, HUANG
5af87baeb0 feat: add filter_deleted option to avoid removing deletion markers (#3707)
* feat: add `filter_deleted` scan option to avoid removing deletion markers.

* refactor: move sort_batches_and_print to test_util
2024-04-16 06:34:41 +00:00
maco
d5a948a0a6 test: Add tests for KvBackend trait implement (#3700)
* test: add etcd

* optimize code

* test: add etcd tests

* fix: typos

* fix: taplo error and clippy

* avoid print

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: tison <wander4096@gmail.com>
2024-04-15 10:51:59 +00:00
Eugene Tolbakov
bbea651d08 feat(promql): parameterize lookback (#3630)
* feat(promql): parameterize lookback

* chore(promql): address CR, adjusted sqlness

* chore(promql): fmt

* chore(promql): fix accidental removal

* fix(promql): address CR

* fix(promql): address CR

* feat(promql): add initial lookback parameter grpc support

* fix: update greptime-proto revision

* chore: restore accidental removal
2024-04-15 09:11:21 +00:00
zyy17
8060c81e1d refactor: use toml2docs to generate config docs (#3704)
* refactor: use toml2docs to generate config docs

* ci: add docs check in 'check-typos-and-docs'
2024-04-15 09:08:32 +00:00
Jeremyhi
e6507aaf34 chore: debt 3696 (#3705) 2024-04-15 09:02:19 +00:00
Jeremyhi
87795248dd feat: get metasrv clusterinfo (#3696)
* feat: add doc for MetasrvOptions

* feat: register candidate before election

* feat: get all peers metasrv

* chore: simplify code

* chore: proto rev

* Update src/common/meta/src/cluster.rs

Co-authored-by: dennis zhuang <killme2008@gmail.com>

* Update src/meta-client/src/client.rs

Co-authored-by: dennis zhuang <killme2008@gmail.com>

* fmt

Signed-off-by: tison <wander4096@gmail.com>

* Apply suggestions from code review

Co-authored-by: dennis zhuang <killme2008@gmail.com>

* impl<T: AsRef<[u8]>> From<T> for LeaderValue

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: dennis zhuang <killme2008@gmail.com>
Co-authored-by: tison <wander4096@gmail.com>
2024-04-15 08:10:48 +00:00
irenjj
7a04bfe50a feat: add strict mode to validate protocol strings (#3638)
* feat: add strict mode to validate protocol strings

* hotfix: fix test

* fix: fix return pair and test param

* test: add test for utf-8 validation

* fix: cargo check

* Update src/servers/src/prom_row_builder.rs

Co-authored-by: Eugene Tolbakov <ev.tolbakov@gmail.com>

* fix: fix param of without_strict_mode

* fix: change field name in HttpOptions

* fix: replace if else with match

* fix: replace all strict_mode with is_strict_mode

* fix: fix test_config_api

* fix: fix bench, add vm handshake, catch error

---------

Co-authored-by: Eugene Tolbakov <ev.tolbakov@gmail.com>
Co-authored-by: tison <wander4096@gmail.com>
2024-04-15 07:53:48 +00:00
Yingwen
2f4726f7b5 refactor: Move manifest manager lock to MitoRegion (#3689)
* feat: remove manager inner wip

* feat: put manifest lock in region

* feat: don't update manifest if manager is stopped

* chore: address CR comments
2024-04-15 05:48:25 +00:00
dennis zhuang
75d85f9915 feat: impl table_constraints table for information_schema (#3698)
* feat: impl table_constraints table for information_schema

* test: update information_schema sqlness test

* test: adds table_constraints sqlness test
2024-04-15 03:59:16 +00:00
discord9
db329f6c80 feat(flow): transform substrait SELECT&WHERE&GROUP BY to Flow Plan (#3690)
* feat: transform substrait SELECT&WHERE&GROUP BY to Flow Plan

* chore: reexport from common/substrait

* feat: use datafusion Aggr Func to map to Flow aggr func

* chore: remove unwrap&split literal

* refactor: split transform.rs into smaller files

* feat: apply optimize for variadic fn

* refactor: split unit test

* chore: per review
2024-04-12 07:38:42 +00:00
Ning Sun
544c4a70f8 refactor: check error type before logging (#3697)
* refactor: check error type before logging

* chore: update log level for broken pipe

* refactor: leave a debugging output for non-critical errors
2024-04-12 02:18:14 +00:00
dimbtp
02f806fba9 fix: cli export "create table" with quoted names (#3684)
* fix: cli export `create table` with quoted names

* add test

* apply review comments

* fix to pass check

* remove eprintln for clippy check

* use prebuilt binary to avoid compile

* ci run coverage after build

* drop dirty hack test

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: tison <wander4096@gmail.com>
2024-04-11 06:56:14 +00:00
tison
9459ace33e ci: add CODEOWNERS file (#3691)
Signed-off-by: tison <wander4096@gmail.com>
2024-04-10 17:47:54 +00:00
Weny Xu
c1e005b148 refactor: drop table procedure (#3688)
* refactor: refactor drop table procedure

* refactor: refactor test utils
2024-04-10 12:22:10 +00:00
discord9
c00c1d95ee chore(flow): more comments&lint (#3680)
* chore: more comments&lint

* chore: per review

* chore: remove abundant dep
2024-04-10 03:31:22 +00:00
tison
5d739932c0 chore: remove TODO that has been done (#3683)
This TODO is done by https://github.com/GreptimeTeam/greptimedb/pull/3473.
2024-04-09 22:55:55 +00:00
Ruihang Xia
aab7367804 feat: try get pk values from cache when applying predicate to parquet (#3286)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: tison <wander4096@gmail.com>
2024-04-09 12:53:38 +00:00
Yohan Wal
34f935df66 chore: create database api change in protobuf (#3682) 2024-04-09 12:11:38 +00:00
Weny Xu
fda1523ced refactor: refactor alter table procedure (#3678)
* refactor: refactor alter table procedure

* chore: apply suggestions from CR

* chore: remove `alter_expr` and `alter_kind`
2024-04-09 10:35:51 +00:00
tison
2c0c7759ee feat: add checksum for checkpoint data (#3651)
* feat: add checksum for checkpoint data

Signed-off-by: tison <wander4096@gmail.com>

* add test

Signed-off-by: tison <wander4096@gmail.com>

* clippy

Signed-off-by: tison <wander4096@gmail.com>

* fix: checksum should calculate on uncompressed data

Signed-off-by: tison <wander4096@gmail.com>

* address comments

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-04-09 08:32:24 +00:00
Weny Xu
2398918adf feat(fuzz): support to create metric table (#3617)
Co-authored-by: tison <wander4096@gmail.com>
2024-04-09 06:00:04 +00:00
Ruihang Xia
50bea2f107 feat: treat all number types as field candidates (#3670)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-04-09 03:28:21 +00:00
JeremyHi
1629435888 chore: unify name metasrv (#3671)
chore: unify name
2024-04-09 03:03:26 +00:00
tison
b3c94a303b chore: add a fix-clippy Makefile target (#3677)
* chore: add a fix-clippy Makefile target

* Update Makefile
2024-04-09 02:59:55 +00:00
tison
883b7fce96 refactor: bundle the lightweight axum test client (#3669)
* refactor: bundle the lightweight axum test client

Signed-off-by: tison <wander4096@gmail.com>

* address comments

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-04-09 02:33:26 +00:00
discord9
ea9367f371 refactor(flow): func spec api&use Error not EvalError in mfp (#3657)
* refactor: func's specialization& use Error not EvalError

* docs: some pub item

* chore: typo

* docs: add comments for every pub item

* chore: per review

* chore: per review&derive Copy

* chore: per review&test for binary fn spec

* docs: comment explain how binary func spec works

* chore: minor style change

* fix: Error not EvalError
2024-04-09 02:32:02 +00:00
tison
2896e1f868 refactor: pass http method to metasrv http handler (#3667)
* refactor: pass http method to metasrv http handler

Signed-off-by: tison <wander4096@gmail.com>

* update maintenance endpoint

Signed-off-by: tison <wander4096@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

* Update src/meta-srv/src/service/admin.rs

Co-authored-by: dennis zhuang <killme2008@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: dennis zhuang <killme2008@gmail.com>
2024-04-09 02:26:42 +00:00
Lei, HUANG
183fccbbd6 chore: remove global_ttl config (#3673)
* chore: remove global_ttl config

* fix: clippy
2024-04-09 02:00:50 +00:00
Weny Xu
b51089fa61 fix: DeserializedValueWithBytes::from_inner misusing (#3676)
* fix: fix `DeserializedValueWithBytes::from_inner` misusing

* Update src/common/meta/src/key.rs

---------

Co-authored-by: tison <wander4096@gmail.com>
2024-04-09 01:48:35 +00:00
Yohan Wal
682b04cbe4 feat(fuzz): add create database target (#3675)
* feat(fuzz): add create database target

* chore(ci): add fuzz_create_database ci cfg
2024-04-09 01:33:29 +00:00
tison
e1d2f9a596 chore: improve contributor click in git-cliff (#3672)
Signed-off-by: tison <wander4096@gmail.com>
2024-04-08 18:15:00 +00:00
tison
2fca45b048 ci: setup-protoc always with token (#3674)
Signed-off-by: tison <wander4096@gmail.com>
2024-04-08 18:13:24 +00:00
Yingwen
3e1a125732 feat: add append mode to table options (#3624)
* feat: add append mode to table options

* test: add append mode test

* test: rename test tables

* chore: Add delete test for append mode
2024-04-08 13:42:58 +00:00
Mofeng
34b1427a82 fix(readme): fix link of Ingester-js (#3668) 2024-04-08 12:17:44 +00:00
discord9
28fd0dc276 feat(flow): render map&related tests (#3581)
* feat: render map&related tests

* chore: license header

* chore: update Cargo.lock&remove unused

* refactor: rename ComputeState to DataflowState

* chore: use org fork

* chore: fix typos

* chore: per review

* chore: more explanation of using `VecDeque` in err collector

* chore: typos

* chore: more comment on `Plan::Let`

* chore: typos

* refactor mfp rendering

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: update `now` in closure

* feat: use insert_local

* chore: remove unused

* chore: per review

* chore: fmt comment

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>
2024-04-08 11:36:07 +00:00
Weny Xu
32b9639d7c feat(procedure): support to rollback (#3625)
* feat: add rollback method

* refactor: simplify the state control

* feat(procedure): support to rollback

* test: add tests for rollback

* feat: persist rollback procedure state

* feat: rollback procedure after restarting

* feat: add `CommitRollback`, `RollingBack` to ProcedureStateResponse

* chore: apply suggestions from CR

* feat: persist rollback error

* feat: add `is_support_rollback`

* chore: apply suggestions from CR

* chore: update greptime-proto

* chore: rename to `rollback_supported`

* chore: rename to `RollbackProcedureRecovered`
2024-04-08 11:23:23 +00:00
248 changed files with 13062 additions and 2762 deletions

.github/CODEOWNERS

@@ -0,0 +1,27 @@
# GreptimeDB CODEOWNERS
# These owners will be the default owners for everything in the repo.
* @GreptimeTeam/db-approver
## [Module] Database Engine
/src/index @zhongzc
/src/mito2 @evenyag @v0y4g3r @waynexia
/src/query @evenyag
## [Module] Distributed
/src/common/meta @MichaelScofield
/src/common/procedure @MichaelScofield
/src/meta-client @MichaelScofield
/src/meta-srv @MichaelScofield
## [Module] Write Ahead Log
/src/log-store @v0y4g3r
/src/store-api @v0y4g3r
## [Module] Metrics Engine
/src/metric-engine @waynexia
/src/promql @waynexia
## [Module] Flow
/src/flow @zhongzc @waynexia

@@ -39,7 +39,7 @@ body:
- Query Engine
- Table Engine
- Write Protocols
- MetaSrv
- Metasrv
- Frontend
- Datanode
- Other

@@ -26,6 +26,8 @@ runs:
using: composite
steps:
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- name: Install rust toolchain
uses: dtolnay/rust-toolchain@master

@@ -33,12 +33,17 @@ env:
RUST_TOOLCHAIN: nightly-2023-12-19
jobs:
typos:
name: Spell Check with Typos
check-typos-and-docs:
name: Check typos and docs
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v4
- uses: crate-ci/typos@v1.13.10
- name: Check the config docs
run: |
make config-docs && \
git diff --name-only --exit-code ./config/config.md \
|| (echo "'config/config.md' is not up-to-date, please run 'make config-docs'." && exit 1)
check:
name: Check
@@ -93,6 +98,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}
@@ -123,10 +130,12 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
target: [ "fuzz_create_table", "fuzz_alter_table" ]
target: [ "fuzz_create_table", "fuzz_alter_table", "fuzz_create_database" ]
steps:
- uses: actions/checkout@v4
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}

Cargo.lock

@@ -793,24 +793,6 @@ dependencies = [
"syn 2.0.43",
]
[[package]]
name = "axum-test-helper"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "298f62fa902c2515c169ab0bfb56c593229f33faa01131215d58e3d4898e3aa9"
dependencies = [
"axum",
"bytes",
"http",
"http-body",
"hyper",
"reqwest",
"serde",
"tokio",
"tower",
"tower-service",
]
[[package]]
name = "backon"
version = "0.4.1"
@@ -1092,6 +1074,12 @@ dependencies = [
"num-traits",
]
[[package]]
name = "bufstream"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40e38929add23cdf8a366df9b0e088953150724bcbe5fc330b0d8eb3b328eec8"
[[package]]
name = "build-data"
version = "0.1.5"
@@ -1650,6 +1638,7 @@ dependencies = [
"substrait 0.7.2",
"table",
"temp-env",
"tempfile",
"tikv-jemallocator",
"tokio",
"toml 0.8.8",
@@ -2303,9 +2292,9 @@ dependencies = [
[[package]]
name = "crc32fast"
version = "1.3.2"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa"
dependencies = [
"cfg-if 1.0.0",
]
@@ -2974,6 +2963,17 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "derive_utils"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61bb5a1014ce6dfc2a378578509abe775a5aa06bff584a547555d9efdb81b926"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "diff"
version = "0.1.13"
@@ -3406,21 +3406,32 @@ name = "flow"
version = "0.7.2"
dependencies = [
"api",
"catalog",
"common-catalog",
"common-decimal",
"common-error",
"common-macro",
"common-telemetry",
"common-time",
"datafusion-common",
"datafusion-expr",
"datafusion-substrait",
"datatypes",
"enum_dispatch",
"hydroflow",
"itertools 0.10.5",
"num-traits",
"prost 0.12.3",
"query",
"serde",
"serde_json",
"servers",
"session",
"smallvec",
"snafu",
"strum 0.25.0",
"substrait 0.7.2",
"table",
"tokio",
"tonic 0.10.2",
]
@@ -3431,6 +3442,21 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
@@ -3781,7 +3807,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=1bd2398b686e5ac6c1eef6daf615867ce27f75c1#1bd2398b686e5ac6c1eef6daf615867ce27f75c1"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=04d78b6e025ceb518040fdd10858c2a9d9345820#04d78b6e025ceb518040fdd10858c2a9d9345820"
dependencies = [
"prost 0.12.3",
"serde",
@@ -3794,9 +3820,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.24"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
dependencies = [
"bytes",
"fnv",
@@ -4059,9 +4085,8 @@ dependencies = [
[[package]]
name = "hydroflow"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5129724896b4c3cf12f8e5f5af2f1d94b4c5933ae911189747025c6a5ff1346"
version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"bincode",
"byteorder",
@@ -4092,9 +4117,8 @@ dependencies = [
[[package]]
name = "hydroflow_datalog"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41813c88b02f3bfa8f5962e125495aa47c8d382cf5d135b02da40af4342bc6fb"
version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"hydroflow_datalog_core",
"proc-macro-crate 1.3.1",
@@ -4105,9 +4129,8 @@ dependencies = [
[[package]]
name = "hydroflow_datalog_core"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea77a3b2f09bba3d461f9ce0dee28798d3b07dafe77fc46de4675155f5925e53"
version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"hydroflow_lang",
"proc-macro-crate 1.3.1",
@@ -4121,9 +4144,8 @@ dependencies = [
[[package]]
name = "hydroflow_lang"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3191eee8ef49b4a814e4c33a0ce0d7470b733dc6118ea744f7f15168c38803f"
version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"auto_impl",
"clap 4.4.11",
@@ -4142,9 +4164,8 @@ dependencies = [
[[package]]
name = "hydroflow_macro"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9be25d2a927fe4e6afe3e204786e968e983f53f313cc561950ff1cd09ecd92fc"
version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"hydroflow_lang",
"itertools 0.10.5",
@@ -4425,6 +4446,15 @@ version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8573b2b1fb643a372c73b23f4da5f888677feef3305146d68a539250a9bccc7"
[[package]]
name = "io-enum"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53b53d712d99a73eec59ee5e4fe6057f8052142d38eeafbbffcb06b36d738a6e"
dependencies = [
"derive_utils",
]
[[package]]
name = "io-lifetimes"
version = "1.0.11"
@@ -4646,9 +4676,8 @@ dependencies = [
[[package]]
name = "lattices"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f3bff82353a971b61106a49369cfc1bd8398661107eadcb5387fcd21c43cac9"
version = "0.5.3"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"cc-traits",
"sealed",
@@ -5329,6 +5358,7 @@ dependencies = [
"common-test-util",
"common-time",
"common-wal",
"crc32fast",
"criterion",
"datafusion",
"datafusion-common",
@@ -5438,6 +5468,32 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b"
[[package]]
name = "mysql"
version = "25.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4cc09a8118051e4617886c9c6e693c61444c2eeb5f9a792dc5d631501706565"
dependencies = [
"bufstream",
"bytes",
"crossbeam",
"flate2",
"io-enum",
"libc",
"lru",
"mysql_common 0.32.0",
"named_pipe",
"native-tls",
"once_cell",
"pem",
"percent-encoding",
"serde",
"serde_json",
"socket2 0.5.5",
"twox-hash",
"url",
]
[[package]]
name = "mysql-common-derive"
version = "0.30.2"
@@ -5619,6 +5675,33 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "named_pipe"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad9c443cce91fc3e12f017290db75dde490d685cdaaf508d7159d7cf41f0eb2b"
dependencies = [
"winapi",
]
[[package]]
name = "native-tls"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
dependencies = [
"lazy_static",
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "ndk-context"
version = "0.1.1"
@@ -6023,12 +6106,50 @@ dependencies = [
"tokio-rustls 0.25.0",
]
[[package]]
name = "openssl"
version = "0.10.64"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f"
dependencies = [
"bitflags 2.4.1",
"cfg-if 1.0.0",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "opentelemetry"
version = "0.21.0"
@@ -7294,9 +7415,8 @@ checksum = "3b7e158a385023d209d6d5f2585c4b468f6dcb3dd5aca9b75c4f1678c05bb375"
[[package]]
name = "pusherator"
version = "0.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd486cb5153e0d8fa91d3daebae48917ae299b2569cc79901922f3923dc312ef"
version = "0.0.5"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"either",
"variadics",
@@ -9036,7 +9156,6 @@ dependencies = [
"auth",
"axum",
"axum-macros",
"axum-test-helper",
"base64 0.21.5",
"bytes",
"catalog",
@@ -9067,6 +9186,7 @@ dependencies = [
"hashbrown 0.14.3",
"headers",
"hostname",
"http",
"http-body",
"humantime-serde",
"hyper",
@@ -10073,6 +10193,32 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-chaos"
version = "0.7.2"
dependencies = [
"axum",
"axum-macros",
"common-error",
"common-macro",
"common-telemetry",
"common-time",
"lazy_static",
"mysql",
"nix 0.26.4",
"prometheus",
"rand",
"rand_chacha",
"reqwest",
"serde",
"serde_json",
"snafu",
"sqlx",
"tests-fuzz",
"tinytemplate",
"tokio",
]
[[package]]
name = "tests-fuzz"
version = "0.7.2"
@@ -10090,6 +10236,7 @@ dependencies = [
"dotenv",
"lazy_static",
"libfuzzer-sys",
"mysql",
"partition",
"rand",
"rand_chacha",
@@ -10111,7 +10258,6 @@ dependencies = [
"async-trait",
"auth",
"axum",
"axum-test-helper",
"catalog",
"chrono",
"client",
@@ -11368,9 +11514,11 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "variadics"
version = "0.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4500f518837578bf2d62d9c12f47ecb5b5279da689574793b7bace8138b4784"
version = "0.0.4"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [
"sealed",
]
[[package]]
name = "vcpkg"
@@ -11447,6 +11595,12 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasite"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
[[package]]
name = "wasm-bindgen"
version = "0.2.89"
@@ -11602,11 +11756,12 @@ dependencies = [
[[package]]
name = "whoami"
version = "1.4.1"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50"
checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
dependencies = [
"wasm-bindgen",
"redox_syscall 0.4.1",
"wasite",
"web-sys",
]

@@ -55,6 +55,7 @@ members = [
"src/store-api",
"src/table",
"src/index",
"tests-chaos",
"tests-fuzz",
"tests-integration",
"tests/runner",
@@ -104,7 +105,7 @@ etcd-client = "0.12"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "1bd2398b686e5ac6c1eef6daf615867ce27f75c1" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "04d78b6e025ceb518040fdd10858c2a9d9345820" }
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"
@@ -133,6 +134,7 @@ reqwest = { version = "0.11", default-features = false, features = [
"json",
"rustls-tls-native-roots",
"stream",
"multipart",
] }
rskafka = "0.5"
rust_decimal = "1.33"
@@ -211,6 +213,7 @@ sql = { path = "src/sql" }
store-api = { path = "src/store-api" }
substrait = { path = "src/common/substrait" }
table = { path = "src/table" }
tests-fuzz = { path = "tests-fuzz" }
[workspace.dependencies.meter-macros]
git = "https://github.com/GreptimeTeam/greptime-meter.git"

@@ -169,6 +169,10 @@ check: ## Cargo check all the targets.
clippy: ## Check clippy rules.
cargo clippy --workspace --all-targets --all-features -- -D warnings
.PHONY: fix-clippy
fix-clippy: ## Fix clippy violations.
cargo clippy --workspace --all-targets --all-features --fix
.PHONY: fmt-check
fmt-check: ## Check code format.
cargo fmt --all -- --check
@@ -188,6 +192,16 @@ run-it-in-container: start-etcd ## Run integration tests in dev-builder.
-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:latest \
make test sqlness-test BUILD_JOBS=${BUILD_JOBS}
##@ Docs
config-docs: ## Generate configuration documentation from toml files.
docker run --rm \
-v ${PWD}:/greptimedb \
-w /greptimedb/config \
toml2docs/toml2docs:latest \
-p '##' \
-t ./config-docs-template.md \
-o ./config.md
##@ General
# The help target prints out all targets with their descriptions organized

@@ -143,7 +143,7 @@ cargo run -- standalone start
- [GreptimeDB C++ Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-cpp)
- [GreptimeDB Erlang Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-erl)
- [GreptimeDB Rust Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-rust)
- [GreptimeDB JavaScript Ingester](https://github.com/GreptimeTeam/greptime-ingester-js)
- [GreptimeDB JavaScript Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-js)
### Grafana Dashboard

@@ -53,7 +53,7 @@ Release date: {{ timestamp | date(format="%B %d, %Y") }}
## New Contributors
{% endif -%}
{% for contributor in github.contributors | filter(attribute="is_first_time", value=true) %}
* @{{ contributor.username }} made their first contribution
* [@{{ contributor.username }}](https://github.com/{{ contributor.username }}) made their first contribution
{%- if contributor.pr_number %} in \
[#{{ contributor.pr_number }}]({{ self::remote_url() }}/pull/{{ contributor.pr_number }}) \
{%- endif %}
@@ -65,7 +65,17 @@ Release date: {{ timestamp | date(format="%B %d, %Y") }}
We would like to thank the following contributors from the GreptimeDB community:
{{ github.contributors | map(attribute="username") | join(sep=", ") }}
{%- set contributors = github.contributors | sort(attribute="username") | map(attribute="username") -%}
{%- set bots = ['dependabot[bot]'] %}
{% for contributor in contributors %}
{%- if bots is containing(contributor) -%}{% continue %}{%- endif -%}
{%- if loop.first -%}
[@{{ contributor }}](https://github.com/{{ contributor }})
{%- else -%}
, [@{{ contributor }}](https://github.com/{{ contributor }})
{%- endif -%}
{%- endfor %}
{%- endif %}
{% raw %}\n{% endraw %}
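
The loop above is Tera template code that git-cliff uses to render first-time and returning contributors while skipping bots. Assuming the template is embedded in the changelog body of the repository's git-cliff configuration (the file path is not shown in this view), the surrounding configuration would look roughly like this sketch:

```toml
# Sketch of a git-cliff configuration embedding a contributor loop like the one above.
# The github.contributors context (with is_first_time and pr_number) comes from
# git-cliff's GitHub integration; this is not the repository's actual configuration file.
[changelog]
body = """
## New Contributors
{% for contributor in github.contributors | filter(attribute="is_first_time", value=true) %}
* [@{{ contributor.username }}](https://github.com/{{ contributor.username }}) made their first contribution
{%- endfor %}
"""
```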

@@ -0,0 +1,19 @@
# Configurations
## Standalone Mode
{{ toml2docs "./standalone.example.toml" }}
## Cluster Mode
### Frontend
{{ toml2docs "./frontend.example.toml" }}
### Metasrv
{{ toml2docs "./metasrv.example.toml" }}
### Datanode
{{ toml2docs "./datanode.example.toml" }}

config/config.md

@@ -0,0 +1,376 @@
# Configurations
## Standalone Mode
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. |
| `default_timezone` | String | `None` | The default timezone of the server. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
| `http.timeout` | String | `30s` | HTTP request timeout. |
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`. |
| `grpc` | -- | -- | The gRPC server options. |
| `grpc.addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `mysql` | -- | -- | MySQL server options. |
| `mysql.enable` | Bool | `true` | Whether to enable. |
| `mysql.addr` | String | `127.0.0.1:4002` | The addr to bind the MySQL server. |
| `mysql.runtime_size` | Integer | `2` | The number of server worker threads. |
| `mysql.tls` | -- | -- | -- |
| `mysql.tls.mode` | String | `disable` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- `disable` (default value)<br/>- `prefer`<br/>- `require`<br/>- `verify-ca`<br/>- `verify-full` |
| `mysql.tls.cert_path` | String | `None` | Certificate file path. |
| `mysql.tls.key_path` | String | `None` | Private key file path. |
| `mysql.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `postgres` | -- | -- | PostgreSQL server options. |
| `postgres.enable` | Bool | `true` | Whether to enable |
| `postgres.addr` | String | `127.0.0.1:4003` | The addr to bind the PostgreSQL server. |
| `postgres.runtime_size` | Integer | `2` | The number of server worker threads. |
| `postgres.tls` | -- | -- | PostgreSQL server TLS options, see `mysql_options.tls` section. |
| `postgres.tls.mode` | String | `disable` | TLS mode. |
| `postgres.tls.cert_path` | String | `None` | Certificate file path. |
| `postgres.tls.key_path` | String | `None` | Private key file path. |
| `postgres.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `opentsdb` | -- | -- | OpenTSDB protocol options. |
| `opentsdb.enable` | Bool | `true` | Whether to enable |
| `opentsdb.addr` | String | `127.0.0.1:4242` | OpenTSDB telnet API server address. |
| `opentsdb.runtime_size` | Integer | `2` | The number of server worker threads. |
| `influxdb` | -- | -- | InfluxDB protocol options. |
| `influxdb.enable` | Bool | `true` | Whether to enable InfluxDB protocol in HTTP API. |
| `prom_store` | -- | -- | Prometheus remote storage options |
| `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. |
| `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. |
| `wal` | -- | -- | The WAL options. |
| `wal.provider` | String | `raft_engine` | The provider of the WAL.<br/>- `raft_engine`: the wal is stored in the local file system by raft-engine.<br/>- `kafka`: it's remote wal that data is stored in Kafka. |
| `wal.dir` | String | `None` | The directory to store the WAL files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.file_size` | String | `256MB` | The size of the WAL segment file.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_threshold` | String | `4GB` | The threshold of the WAL size to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_interval` | String | `10m` | The interval to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.read_batch_size` | Integer | `128` | The read batch size.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.sync_write` | Bool | `false` | Whether to use sync write.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.enable_log_recycle` | Bool | `true` | Whether to reuse logically truncated log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.prefill_log_files` | Bool | `false` | Whether to pre-create log files on start up.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.sync_period` | String | `10s` | Duration for fsyncing log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.broker_endpoints` | Array | -- | The Kafka broker endpoints.<br/>**It's only used when the provider is `kafka`**. |
| `wal.max_batch_size` | String | `1MB` | The max size of a single producer batch.<br/>Warning: Kafka has a default limit of 1MB per message in a topic.<br/>**It's only used when the provider is `kafka`**. |
| `wal.linger` | String | `200ms` | The linger duration of a kafka batch producer.<br/>**It's only used when the provider is `kafka`**. |
| `wal.consumer_wait_timeout` | String | `100ms` | The consumer wait timeout.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_init` | String | `500ms` | The initial backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_max` | String | `10s` | The maximum backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_base` | Integer | `2` | The exponential backoff rate, i.e. next backoff = base * current backoff.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_deadline` | String | `5mins` | The deadline of retries.<br/>**It's only used when the provider is `kafka`**. |
| `metadata_store` | -- | -- | Metadata storage options. |
| `metadata_store.file_size` | String | `256MB` | Kv file size in bytes. |
| `metadata_store.purge_threshold` | String | `4GB` | Kv purge threshold. |
| `procedure` | -- | -- | Procedure storage options. |
| `procedure.max_retry_times` | Integer | `3` | Procedure max retry time. |
| `procedure.retry_delay` | String | `500ms` | Initial retry delay of procedures, increases exponentially |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
| `storage.cache_path` | String | `None` | Cache configuration for object storage such as 'S3' etc.<br/>The local file cache directory. |
| `storage.cache_capacity` | String | `None` | The local file cache capacity in bytes. |
| `storage.bucket` | String | `None` | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
| `storage.root` | String | `None` | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
| `storage.access_key_id` | String | `None` | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
| `storage.secret_access_key` | String | `None` | The secret access key of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3`**. |
| `storage.access_key_secret` | String | `None` | The secret access key of the aliyun account.<br/>**It's only used when the storage type is `Oss`**. |
| `storage.account_name` | String | `None` | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.account_key` | String | `None` | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.scope` | String | `None` | The scope of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential_path` | String | `None` | The credential path of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.container` | String | `None` | The container of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.sas_token` | String | `None` | The sas token of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.endpoint` | String | `None` | The endpoint of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.region` | String | `None` | The region of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `[[region_engine]]` | -- | -- | The region engine options. You can configure multiple region engines. |
| `region_engine.mito` | -- | -- | The Mito engine options. |
| `region_engine.mito.num_workers` | Integer | `8` | Number of region workers. |
| `region_engine.mito.worker_channel_size` | Integer | `128` | Request channel size of each worker. |
| `region_engine.mito.worker_request_batch_size` | Integer | `64` | Max batch size for a worker to handle requests. |
| `region_engine.mito.manifest_checkpoint_distance` | Integer | `10` | Number of meta action updated to trigger a new checkpoint for the manifest. |
| `region_engine.mito.compress_manifest` | Bool | `false` | Whether to compress manifest and checkpoint file by gzip (default false). |
| `region_engine.mito.max_background_jobs` | Integer | `4` | Max number of running background jobs |
| `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
| `region_engine.mito.global_write_buffer_size` | String | `1GB` | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
| `region_engine.mito.global_write_buffer_reject_size` | String | `2GB` | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size` |
| `region_engine.mito.sst_meta_cache_size` | String | `128MB` | Cache size for SST metadata. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.vector_cache_size` | String | `512MB` | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | `512MB` | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.scan_parallelism` | Integer | `0` | Parallelism to scan a region (default: 1/4 of cpu cores).<br/>- `0`: using the default value (1/4 of cpu cores).<br/>- `1`: scan in current thread.<br/>- `n`: scan in parallelism n. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.inverted_index` | -- | -- | The options for inverted index in Mito engine. |
| `region_engine.mito.inverted_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically<br/>- `disable`: never |
| `region_engine.mito.inverted_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically<br/>- `disable`: never |
| `region_engine.mito.inverted_index.apply_on_query` | String | `auto` | Whether to apply the index on query<br/>- `auto`: automatically<br/>- `disable`: never |
| `region_engine.mito.inverted_index.mem_threshold_on_create` | String | `64M` | Memory threshold for performing an external sort during index creation.<br/>Setting to empty will disable external sorting, forcing all sorting operations to happen in memory. |
| `region_engine.mito.inverted_index.intermediate_path` | String | `""` | File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`). |
| `region_engine.mito.memtable` | -- | -- | -- |
| `region_engine.mito.memtable.type` | String | `time_series` | Memtable type.<br/>- `time_series`: time-series memtable<br/>- `partition_tree`: partition tree memtable (experimental) |
| `region_engine.mito.memtable.index_max_keys_per_shard` | Integer | `8192` | The max number of keys in one shard.<br/>Only available for `partition_tree` memtable. |
| `region_engine.mito.memtable.data_freeze_threshold` | Integer | `32768` | The max rows of data inside the actively writing buffer in one shard.<br/>Only available for `partition_tree` memtable. |
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `None` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.<br/>Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.<br/>ratio > 1 are treated as 1. Fractions < 0 are treated as 0 |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
| `export_metrics` | -- | -- | The datanode can export its metrics and send them to a Prometheus-compatible service (e.g. `greptimedb` itself) via the remote-write API.<br/>This is only used for `greptimedb` to export its own metrics internally. It's different from a Prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | Whether to enable exporting metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended for collecting metrics generated by the instance itself. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
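
For orientation (this note and the snippet below are not part of the generated `config.md`), the rows above map one-to-one onto TOML keys in the standalone configuration file; an illustrative fragment using the documented defaults:

```toml
# Illustrative standalone fragment assembled from the defaults documented above.
mode = "standalone"
enable_telemetry = true

[http]
addr = "127.0.0.1:4000"
timeout = "30s"

[wal]
provider = "raft_engine"   # set to "kafka" for remote WAL
file_size = "256MB"        # raft_engine provider only
purge_threshold = "4GB"    # raft_engine provider only

[export_metrics]
enable = false
write_interval = "30s"
```
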
## Cluster Mode
### Frontend
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. |
| `default_timezone` | String | `None` | The default timezone of the server. |
| `heartbeat` | -- | -- | The heartbeat options. |
| `heartbeat.interval` | String | `18s` | Interval for sending heartbeat messages to the metasrv. |
| `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
| `http.timeout` | String | `30s` | HTTP request timeout. |
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`. |
| `grpc` | -- | -- | The gRPC server options. |
| `grpc.addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `mysql` | -- | -- | MySQL server options. |
| `mysql.enable` | Bool | `true` | Whether to enable. |
| `mysql.addr` | String | `127.0.0.1:4002` | The addr to bind the MySQL server. |
| `mysql.runtime_size` | Integer | `2` | The number of server worker threads. |
| `mysql.tls` | -- | -- | -- |
| `mysql.tls.mode` | String | `disable` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- `disable` (default value)<br/>- `prefer`<br/>- `require`<br/>- `verify-ca`<br/>- `verify-full` |
| `mysql.tls.cert_path` | String | `None` | Certificate file path. |
| `mysql.tls.key_path` | String | `None` | Private key file path. |
| `mysql.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `postgres` | -- | -- | PostgreSQL server options. |
| `postgres.enable` | Bool | `true` | Whether to enable |
| `postgres.addr` | String | `127.0.0.1:4003` | The addr to bind the PostgreSQL server. |
| `postgres.runtime_size` | Integer | `2` | The number of server worker threads. |
| `postgres.tls` | -- | -- | PostgreSQL server TLS options, see `mysql_options.tls` section. |
| `postgres.tls.mode` | String | `disable` | TLS mode. |
| `postgres.tls.cert_path` | String | `None` | Certificate file path. |
| `postgres.tls.key_path` | String | `None` | Private key file path. |
| `postgres.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `opentsdb` | -- | -- | OpenTSDB protocol options. |
| `opentsdb.enable` | Bool | `true` | Whether to enable |
| `opentsdb.addr` | String | `127.0.0.1:4242` | OpenTSDB telnet API server address. |
| `opentsdb.runtime_size` | Integer | `2` | The number of server worker threads. |
| `influxdb` | -- | -- | InfluxDB protocol options. |
| `influxdb.enable` | Bool | `true` | Whether to enable InfluxDB protocol in HTTP API. |
| `prom_store` | -- | -- | Prometheus remote storage options |
| `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. |
| `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. |
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
| `meta_client.metadata_cache_max_capacity` | Integer | `100000` | The configuration about the cache of the metadata. |
| `meta_client.metadata_cache_ttl` | String | `10m` | TTL of the metadata cache. |
| `meta_client.metadata_cache_tti` | String | `5m` | -- |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.timeout` | String | `10s` | -- |
| `datanode.client.connect_timeout` | String | `10s` | -- |
| `datanode.client.tcp_nodelay` | Bool | `true` | -- |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `None` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.<br/>Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.<br/>ratio > 1 are treated as 1. Fractions < 0 are treated as 0 |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
| `export_metrics` | -- | -- | The datanode can export its metrics and send them to a Prometheus-compatible service (e.g. `greptimedb` itself) via the remote-write API.<br/>This is only used for `greptimedb` to export its own metrics internally. It's different from a Prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | Whether to enable exporting metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended for collecting metrics generated by the instance itself. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
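
Likewise (again not part of the generated file), a frontend fragment illustrating the `heartbeat` and `meta_client` rows above:

```toml
# Illustrative frontend fragment; the metasrv address list has no documented default,
# so the value below is only an example.
[heartbeat]
interval = "18s"
retry_interval = "3s"

[meta_client]
metasrv_addrs = ["127.0.0.1:3002"]
timeout = "3s"
heartbeat_timeout = "500ms"
ddl_timeout = "10s"
connect_timeout = "1s"
tcp_nodelay = true
```
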
### Metasrv
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `data_home` | String | `/tmp/metasrv/` | The working home directory. |
| `bind_addr` | String | `127.0.0.1:3002` | The bind address of metasrv. |
| `server_addr` | String | `127.0.0.1:3002` | The communication server address for frontend and datanode to connect to metasrv, "127.0.0.1:3002" by default for localhost. |
| `store_addr` | String | `127.0.0.1:2379` | Etcd server address. |
| `selector` | String | `lease_based` | Datanode selector type.<br/>- `lease_based` (default value).<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. |
| `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. |
| `procedure` | -- | -- | Procedure storage options. |
| `procedure.max_retry_times` | Integer | `12` | Procedure max retry time. |
| `procedure.retry_delay` | String | `500ms` | Initial retry delay of procedures, increases exponentially |
| `procedure.max_metadata_value_size` | String | `1500KiB` | Auto-split large values.<br/>GreptimeDB procedure uses etcd as the default metadata storage backend.<br/>etcd limits the maximum size of any request to 1.5 MiB.<br/>1500KiB = 1536KiB (1.5MiB) - 36KiB (reserved size of key)<br/>Comment out `max_metadata_value_size` to disable splitting large values (no limit). |
| `failure_detector` | -- | -- | -- |
| `failure_detector.threshold` | Float | `8.0` | -- |
| `failure_detector.min_std_deviation` | String | `100ms` | -- |
| `failure_detector.acceptable_heartbeat_pause` | String | `3000ms` | -- |
| `failure_detector.first_heartbeat_estimate` | String | `1000ms` | -- |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.timeout` | String | `10s` | -- |
| `datanode.client.connect_timeout` | String | `10s` | -- |
| `datanode.client.tcp_nodelay` | Bool | `true` | -- |
| `wal` | -- | -- | -- |
| `wal.provider` | String | `raft_engine` | -- |
| `wal.broker_endpoints` | Array | -- | The broker endpoints of the Kafka cluster. |
| `wal.num_topics` | Integer | `64` | Number of topics to be created upon start. |
| `wal.selector_type` | String | `round_robin` | Topic selector type.<br/>Available selector types:<br/>- `round_robin` (default) |
| `wal.topic_name_prefix` | String | `greptimedb_wal_topic` | A Kafka topic is constructed by concatenating `topic_name_prefix` and `topic_id`. |
| `wal.replication_factor` | Integer | `1` | Expected number of replicas of each partition. |
| `wal.create_topic_timeout` | String | `30s` | Above which a topic creation operation will be cancelled. |
| `wal.backoff_init` | String | `500ms` | The initial backoff for kafka clients. |
| `wal.backoff_max` | String | `10s` | The maximum backoff for kafka clients. |
| `wal.backoff_base` | Integer | `2` | Exponential backoff rate, i.e. next backoff = base * current backoff. |
| `wal.backoff_deadline` | String | `5mins` | Stop reconnecting if the total wait time reaches the deadline. If this config is missing, the reconnecting won't terminate. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `None` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of traces that are sampled and exported.<br/>Valid range is `[0, 1]`: 1 means all traces are sampled, 0 means none are sampled; the default value is 1.<br/>Ratios > 1 are treated as 1 and ratios < 0 are treated as 0. |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
| `export_metrics` | -- | -- | The metasrv can export its own metrics to a Prometheus-compatible service (e.g. `greptimedb` itself) via the remote-write API.<br/>This is only used for `greptimedb` to export its own metrics internally; it's different from a Prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | Whether to enable exporting metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of exporting metrics. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended to collect the metrics generated by the instance itself. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The URL to send the metrics to, for example `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers carried by the Prometheus remote-write requests. |
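
Putting the options above together, a minimal metasrv configuration might look like the following sketch; it assumes a Kafka-backed remote WAL and simply restates the documented defaults, so treat it as an illustration rather than a recommended setup:

```toml
data_home = "/tmp/metasrv/"
bind_addr = "127.0.0.1:3002"
server_addr = "127.0.0.1:3002"
store_addr = "127.0.0.1:2379"
selector = "lease_based"

[procedure]
max_retry_times = 12
retry_delay = "500ms"
# 1500KiB = 1536KiB (the 1.5MiB etcd request limit) - 36KiB reserved for the key.
# Comment this out to disable splitting of large values.
max_metadata_value_size = "1500KiB"

[wal]
# `kafka` enables the remote WAL; the keys below are ignored with `raft_engine`.
provider = "kafka"
broker_endpoints = ["127.0.0.1:9092"]
num_topics = 64
selector_type = "round_robin"
topic_name_prefix = "greptimedb_wal_topic"
replication_factor = 1
create_topic_timeout = "30s"
```
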
### Datanode
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. |
| `node_id` | Integer | `None` | The datanode identifier; it should be unique in the cluster. |
| `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.<br/>It will block the datanode start if it can't receive leases in the heartbeat from metasrv. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `rpc_addr` | String | `127.0.0.1:3001` | The gRPC address of the datanode. |
| `rpc_hostname` | String | `None` | The hostname of the datanode. |
| `rpc_runtime_size` | Integer | `8` | The number of gRPC server worker threads. |
| `rpc_max_recv_message_size` | String | `512MB` | The maximum receive message size for gRPC server. |
| `rpc_max_send_message_size` | String | `512MB` | The maximum send message size for gRPC server. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. |
| `heartbeat` | -- | -- | The heartbeat options. |
| `heartbeat.interval` | String | `3s` | Interval for sending heartbeat messages to the metasrv. |
| `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. |
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
| `meta_client.metadata_cache_max_capacity` | Integer | `100000` | The maximum capacity of the metadata cache. |
| `meta_client.metadata_cache_ttl` | String | `10m` | TTL of the metadata cache. |
| `meta_client.metadata_cache_tti` | String | `5m` | -- |
| `wal` | -- | -- | The WAL options. |
| `wal.provider` | String | `raft_engine` | The provider of the WAL.<br/>- `raft_engine`: the WAL is stored in the local file system by raft-engine.<br/>- `kafka`: remote WAL, with data stored in Kafka. |
| `wal.dir` | String | `None` | The directory to store the WAL files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.file_size` | String | `256MB` | The size of the WAL segment file.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_threshold` | String | `4GB` | The threshold of the WAL size to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_interval` | String | `10m` | The interval to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.read_batch_size` | Integer | `128` | The read batch size.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.sync_write` | Bool | `false` | Whether to use sync write.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.enable_log_recycle` | Bool | `true` | Whether to reuse logically truncated log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.prefill_log_files` | Bool | `false` | Whether to pre-create log files on start up.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.sync_period` | String | `10s` | Duration for fsyncing log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.broker_endpoints` | Array | -- | The Kafka broker endpoints.<br/>**It's only used when the provider is `kafka`**. |
| `wal.max_batch_size` | String | `1MB` | The max size of a single producer batch.<br/>Warning: Kafka has a default limit of 1MB per message in a topic.<br/>**It's only used when the provider is `kafka`**. |
| `wal.linger` | String | `200ms` | The linger duration of a kafka batch producer.<br/>**It's only used when the provider is `kafka`**. |
| `wal.consumer_wait_timeout` | String | `100ms` | The consumer wait timeout.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_init` | String | `500ms` | The initial backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_max` | String | `10s` | The maximum backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_base` | Integer | `2` | The exponential backoff rate, i.e. next backoff = base * current backoff.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_deadline` | String | `5mins` | The deadline of retries.<br/>**It's only used when the provider is `kafka`**. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
| `storage.cache_path` | String | `None` | The local file cache directory, used to cache object storage such as `S3`. |
| `storage.cache_capacity` | String | `None` | The local file cache capacity in bytes. |
| `storage.bucket` | String | `None` | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
| `storage.root` | String | `None` | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
| `storage.access_key_id` | String | `None` | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
| `storage.secret_access_key` | String | `None` | The secret access key of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3`**. |
| `storage.access_key_secret` | String | `None` | The secret access key of the aliyun account.<br/>**It's only used when the storage type is `Oss`**. |
| `storage.account_name` | String | `None` | The account name of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.account_key` | String | `None` | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.scope` | String | `None` | The scope of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential_path` | String | `None` | The credential path of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.container` | String | `None` | The container of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.sas_token` | String | `None` | The sas token of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.endpoint` | String | `None` | The endpoint of the object storage service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.region` | String | `None` | The region of the object storage service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `[[region_engine]]` | -- | -- | The region engine options. You can configure multiple region engines. |
| `region_engine.mito` | -- | -- | The Mito engine options. |
| `region_engine.mito.num_workers` | Integer | `8` | Number of region workers. |
| `region_engine.mito.worker_channel_size` | Integer | `128` | Request channel size of each worker. |
| `region_engine.mito.worker_request_batch_size` | Integer | `64` | Max batch size for a worker to handle requests. |
| `region_engine.mito.manifest_checkpoint_distance` | Integer | `10` | Number of meta action updates required to trigger a new checkpoint for the manifest. |
| `region_engine.mito.compress_manifest` | Bool | `false` | Whether to compress manifest and checkpoint files with gzip (default false). |
| `region_engine.mito.max_background_jobs` | Integer | `4` | Max number of running background jobs. |
| `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
| `region_engine.mito.global_write_buffer_size` | String | `1GB` | Global write buffer size for all regions. If not set, it defaults to 1/8 of OS memory with a max limitation of 1GB. |
| `region_engine.mito.global_write_buffer_reject_size` | String | `2GB` | Global write buffer size threshold to reject write requests. If not set, it defaults to 2 times `global_write_buffer_size`. |
| `region_engine.mito.sst_meta_cache_size` | String | `128MB` | Cache size for SST metadata. Set it to 0 to disable the cache.<br/>If not set, it defaults to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.vector_cache_size` | String | `512MB` | Cache size for vectors and arrow arrays. Set it to 0 to disable the cache.<br/>If not set, it defaults to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | `512MB` | Cache size for pages of SST row groups. Set it to 0 to disable the cache.<br/>If not set, it defaults to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.scan_parallelism` | Integer | `0` | Parallelism used to scan a region (default: 1/4 of CPU cores).<br/>- `0`: use the default value (1/4 of CPU cores).<br/>- `1`: scan in the current thread.<br/>- `n`: scan with parallelism `n`.<br/>See the sketch after this table. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.inverted_index` | -- | -- | The options for inverted index in Mito engine. |
| `region_engine.mito.inverted_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically<br/>- `disable`: never |
| `region_engine.mito.inverted_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically<br/>- `disable`: never |
| `region_engine.mito.inverted_index.apply_on_query` | String | `auto` | Whether to apply the index on query<br/>- `auto`: automatically<br/>- `disable`: never |
| `region_engine.mito.inverted_index.mem_threshold_on_create` | String | `64M` | Memory threshold for performing an external sort during index creation.<br/>Setting to empty will disable external sorting, forcing all sorting operations to happen in memory. |
| `region_engine.mito.inverted_index.intermediate_path` | String | `""` | File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`). |
| `region_engine.mito.memtable` | -- | -- | -- |
| `region_engine.mito.memtable.type` | String | `time_series` | Memtable type.<br/>- `time_series`: time-series memtable<br/>- `partition_tree`: partition tree memtable (experimental) |
| `region_engine.mito.memtable.index_max_keys_per_shard` | Integer | `8192` | The max number of keys in one shard.<br/>Only available for `partition_tree` memtable. |
| `region_engine.mito.memtable.data_freeze_threshold` | Integer | `32768` | The max rows of data inside the actively writing buffer in one shard.<br/>Only available for `partition_tree` memtable. |
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `None` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of traces that are sampled and exported.<br/>Valid range is `[0, 1]`: 1 means all traces are sampled, 0 means none are sampled; the default value is 1.<br/>Ratios > 1 are treated as 1 and ratios < 0 are treated as 0. |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
| `export_metrics` | -- | -- | The datanode can export its own metrics to a Prometheus-compatible service (e.g. `greptimedb` itself) via the remote-write API.<br/>This is only used for `greptimedb` to export its own metrics internally; it's different from a Prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | Whether to enable exporting metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of exporting metrics. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended to collect the metrics generated by the instance itself. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The URL to send the metrics to, for example `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers carried by the Prometheus remote-write requests. |
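
As a concrete illustration of the region-engine rows above, here is a sketch of a `[[region_engine]]` block that opts into the experimental `partition_tree` memtable; every value mirrors a default documented in this table except the memtable `type`, which is changed purely for the example:

```toml
[[region_engine]]
[region_engine.mito]
num_workers = 8
# 0 keeps the default scan parallelism (1/4 of CPU cores); use `n` for explicit parallelism.
scan_parallelism = 0
parallel_scan_channel_size = 32

[region_engine.mito.memtable]
# `time_series` is the default; `partition_tree` is experimental.
type = "partition_tree"
# The three keys below only take effect with the `partition_tree` memtable.
index_max_keys_per_shard = 8192
data_freeze_threshold = 32768
fork_dictionary_bytes = "1GiB"
```
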


@@ -1,171 +1,430 @@
# Node running mode, see `standalone.example.toml`.
mode = "distributed"
# The datanode identifier, should be unique.
## The running mode of the datanode. It can be `standalone` or `distributed`.
mode = "standalone"
## The datanode identifier and should be unique in the cluster.
## +toml2docs:none-default
node_id = 42
# gRPC server address, "127.0.0.1:3001" by default.
rpc_addr = "127.0.0.1:3001"
# Hostname of this node.
rpc_hostname = "127.0.0.1"
# The number of gRPC server worker threads, 8 by default.
rpc_runtime_size = 8
# Start services after regions have obtained leases.
# It will block the datanode start if it can't receive leases in the heartbeat from metasrv.
## Start services after regions have obtained leases.
## It will block the datanode start if it can't receive leases in the heartbeat from metasrv.
require_lease_before_startup = false
# Initialize all regions in the background during the startup.
# By default, it provides services after all regions have been initialized.
## Initialize all regions in the background during the startup.
## By default, it provides services after all regions have been initialized.
init_regions_in_background = false
## The gRPC address of the datanode.
rpc_addr = "127.0.0.1:3001"
## The hostname of the datanode.
## +toml2docs:none-default
rpc_hostname = "127.0.0.1"
## The number of gRPC server worker threads.
rpc_runtime_size = 8
## The maximum receive message size for gRPC server.
rpc_max_recv_message_size = "512MB"
## The maximum send message size for gRPC server.
rpc_max_send_message_size = "512MB"
## Enable telemetry to collect anonymous usage data.
enable_telemetry = true
## The heartbeat options.
[heartbeat]
# Interval for sending heartbeat messages to the Metasrv, 3 seconds by default.
## Interval for sending heartbeat messages to the metasrv.
interval = "3s"
# Metasrv client options.
## Interval for retrying to send heartbeat messages to the metasrv.
retry_interval = "3s"
## The metasrv client options.
[meta_client]
# Metasrv address list.
## The addresses of the metasrv.
metasrv_addrs = ["127.0.0.1:3002"]
# Heartbeat timeout, 500 milliseconds by default.
heartbeat_timeout = "500ms"
# Operation timeout, 3 seconds by default.
## Operation timeout.
timeout = "3s"
# Connect server timeout, 1 second by default.
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout.
ddl_timeout = "10s"
## Connect server timeout.
connect_timeout = "1s"
# `TCP_NODELAY` option for accepted connections, true by default.
## `TCP_NODELAY` option for accepted connections.
tcp_nodelay = true
# WAL options.
## The configuration about the cache of the metadata.
metadata_cache_max_capacity = 100000
## TTL of the metadata cache.
metadata_cache_ttl = "10m"
# TTI of the metadata cache.
metadata_cache_tti = "5m"
## The WAL options.
[wal]
## The provider of the WAL.
## - `raft_engine`: the wal is stored in the local file system by raft-engine.
## - `kafka`: it's remote wal that data is stored in Kafka.
provider = "raft_engine"
# Raft-engine wal options, see `standalone.example.toml`.
# dir = "/tmp/greptimedb/wal"
## The directory to store the WAL files.
## **It's only used when the provider is `raft_engine`**.
## +toml2docs:none-default
dir = "/tmp/greptimedb/wal"
## The size of the WAL segment file.
## **It's only used when the provider is `raft_engine`**.
file_size = "256MB"
## The threshold of the WAL size to trigger a flush.
## **It's only used when the provider is `raft_engine`**.
purge_threshold = "4GB"
## The interval to trigger a flush.
## **It's only used when the provider is `raft_engine`**.
purge_interval = "10m"
## The read batch size.
## **It's only used when the provider is `raft_engine`**.
read_batch_size = 128
## Whether to use sync write.
## **It's only used when the provider is `raft_engine`**.
sync_write = false
# Kafka wal options, see `standalone.example.toml`.
# broker_endpoints = ["127.0.0.1:9092"]
# Warning: Kafka has a default limit of 1MB per message in a topic.
# max_batch_size = "1MB"
# linger = "200ms"
# consumer_wait_timeout = "100ms"
# backoff_init = "500ms"
# backoff_max = "10s"
# backoff_base = 2
# backoff_deadline = "5mins"
## Whether to reuse logically truncated log files.
## **It's only used when the provider is `raft_engine`**.
enable_log_recycle = true
# Storage options, see `standalone.example.toml`.
## Whether to pre-create log files on start up.
## **It's only used when the provider is `raft_engine`**.
prefill_log_files = false
## Duration for fsyncing log files.
## **It's only used when the provider is `raft_engine`**.
sync_period = "10s"
## The Kafka broker endpoints.
## **It's only used when the provider is `kafka`**.
broker_endpoints = ["127.0.0.1:9092"]
## The max size of a single producer batch.
## Warning: Kafka has a default limit of 1MB per message in a topic.
## **It's only used when the provider is `kafka`**.
max_batch_size = "1MB"
## The linger duration of a kafka batch producer.
## **It's only used when the provider is `kafka`**.
linger = "200ms"
## The consumer wait timeout.
## **It's only used when the provider is `kafka`**.
consumer_wait_timeout = "100ms"
## The initial backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_init = "500ms"
## The maximum backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_max = "10s"
## The exponential backoff rate, i.e. next backoff = base * current backoff.
## **It's only used when the provider is `kafka`**.
backoff_base = 2
## The deadline of retries.
## **It's only used when the provider is `kafka`**.
backoff_deadline = "5mins"
# Example of using S3 as the storage.
# [storage]
# type = "S3"
# bucket = "greptimedb"
# root = "data"
# access_key_id = "test"
# secret_access_key = "123456"
# endpoint = "https://s3.amazonaws.com"
# region = "us-west-2"
# Example of using Oss as the storage.
# [storage]
# type = "Oss"
# bucket = "greptimedb"
# root = "data"
# access_key_id = "test"
# access_key_secret = "123456"
# endpoint = "https://oss-cn-hangzhou.aliyuncs.com"
# Example of using Azblob as the storage.
# [storage]
# type = "Azblob"
# container = "greptimedb"
# root = "data"
# account_name = "test"
# account_key = "123456"
# endpoint = "https://greptimedb.blob.core.windows.net"
# sas_token = ""
# Example of using Gcs as the storage.
# [storage]
# type = "Gcs"
# bucket = "greptimedb"
# root = "data"
# scope = "test"
# credential_path = "123456"
# endpoint = "https://storage.googleapis.com"
## The data storage options.
[storage]
# The working home directory.
## The working home directory.
data_home = "/tmp/greptimedb/"
# Storage type.
type = "File"
# TTL for all tables. Disabled by default.
# global_ttl = "7d"
# Cache configuration for object storage such as 'S3' etc.
# The local file cache directory
# cache_path = "/path/local_cache"
# The local file cache capacity in bytes.
# cache_capacity = "256MB"
## The storage type used to store the data.
## - `File`: the data is stored in the local file system.
## - `S3`: the data is stored in the S3 object storage.
## - `Gcs`: the data is stored in the Google Cloud Storage.
## - `Azblob`: the data is stored in the Azure Blob Storage.
## - `Oss`: the data is stored in the Aliyun OSS.
type = "File"
## Cache configuration for object storage such as 'S3' etc.
## The local file cache directory.
## +toml2docs:none-default
cache_path = "/path/local_cache"
## The local file cache capacity in bytes.
## +toml2docs:none-default
cache_capacity = "256MB"
## The S3 bucket name.
## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
## +toml2docs:none-default
bucket = "greptimedb"
## The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
## **It's only used when the storage type is `S3`, `Oss` and `Azblob`**.
## +toml2docs:none-default
root = "greptimedb"
## The access key id of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3` and `Oss`**.
## +toml2docs:none-default
access_key_id = "test"
## The secret access key of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3`**.
## +toml2docs:none-default
secret_access_key = "test"
## The secret access key of the aliyun account.
## **It's only used when the storage type is `Oss`**.
## +toml2docs:none-default
access_key_secret = "test"
## The account key of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
account_name = "test"
## The account key of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
account_key = "test"
## The scope of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
scope = "test"
## The credential path of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
credential_path = "test"
## The container of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
container = "greptimedb"
## The sas token of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
sas_token = ""
## The endpoint of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
endpoint = "https://s3.amazonaws.com"
## The region of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
region = "us-west-2"
# Custom storage options
#[[storage.providers]]
#type = "S3"
#[[storage.providers]]
#type = "Gcs"
# [[storage.providers]]
# type = "S3"
# [[storage.providers]]
# type = "Gcs"
# Mito engine options
## The region engine options. You can configure multiple region engines.
[[region_engine]]
## The Mito engine options.
[region_engine.mito]
# Number of region workers
## Number of region workers.
num_workers = 8
# Request channel size of each worker
## Request channel size of each worker.
worker_channel_size = 128
# Max batch size for a worker to handle requests
## Max batch size for a worker to handle requests.
worker_request_batch_size = 64
# Number of meta action updated to trigger a new checkpoint for the manifest
## Number of meta action updated to trigger a new checkpoint for the manifest.
manifest_checkpoint_distance = 10
# Whether to compress manifest and checkpoint file by gzip (default false).
## Whether to compress manifest and checkpoint file by gzip (default false).
compress_manifest = false
# Max number of running background jobs
## Max number of running background jobs
max_background_jobs = 4
# Interval to auto flush a region if it has not flushed yet.
## Interval to auto flush a region if it has not flushed yet.
auto_flush_interval = "1h"
# Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB.
## Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB.
global_write_buffer_size = "1GB"
# Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`
## Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`
global_write_buffer_reject_size = "2GB"
# Cache size for SST metadata. Setting it to 0 to disable the cache.
# If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
## Cache size for SST metadata. Setting it to 0 to disable the cache.
## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
sst_meta_cache_size = "128MB"
# Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.
# If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
## Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.
## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
vector_cache_size = "512MB"
# Cache size for pages of SST row groups. Setting it to 0 to disable the cache.
# If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
## Cache size for pages of SST row groups. Setting it to 0 to disable the cache.
## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
page_cache_size = "512MB"
# Buffer size for SST writing.
## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
# Parallelism to scan a region (default: 1/4 of cpu cores).
# - 0: using the default value (1/4 of cpu cores).
# - 1: scan in current thread.
# - n: scan in parallelism n.
## Parallelism to scan a region (default: 1/4 of cpu cores).
## - `0`: using the default value (1/4 of cpu cores).
## - `1`: scan in current thread.
## - `n`: scan in parallelism n.
scan_parallelism = 0
# Capacity of the channel to send data from parallel scan tasks to the main task (default 32).
## Capacity of the channel to send data from parallel scan tasks to the main task.
parallel_scan_channel_size = 32
# Whether to allow stale WAL entries read during replay.
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false
## The options for inverted index in Mito engine.
[region_engine.mito.inverted_index]
# Whether to create the index on flush.
# - "auto": automatically
# - "disable": never
## Whether to create the index on flush.
## - `auto`: automatically
## - `disable`: never
create_on_flush = "auto"
# Whether to create the index on compaction.
# - "auto": automatically
# - "disable": never
## Whether to create the index on compaction.
## - `auto`: automatically
## - `disable`: never
create_on_compaction = "auto"
# Whether to apply the index on query
# - "auto": automatically
# - "disable": never
## Whether to apply the index on query
## - `auto`: automatically
## - `disable`: never
apply_on_query = "auto"
# Memory threshold for performing an external sort during index creation.
# Setting to empty will disable external sorting, forcing all sorting operations to happen in memory.
## Memory threshold for performing an external sort during index creation.
## Setting to empty will disable external sorting, forcing all sorting operations to happen in memory.
mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
## File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "partition_tree": partition tree memtable
# - "time_series": time-series memtable (deprecated)
type = "partition_tree"
# The max number of keys in one shard.
## Memtable type.
## - `time_series`: time-series memtable
## - `partition_tree`: partition tree memtable (experimental)
type = "time_series"
## The max number of keys in one shard.
## Only available for `partition_tree` memtable.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
## The max rows of data inside the actively writing buffer in one shard.
## Only available for `partition_tree` memtable.
data_freeze_threshold = 32768
# Max dictionary bytes.
## Max dictionary bytes.
## Only available for `partition_tree` memtable.
fork_dictionary_bytes = "1GiB"
# Log options, see `standalone.example.toml`
# [logging]
# dir = "/tmp/greptimedb/logs"
# level = "info"
## The logging options.
[logging]
## The directory to store the log files.
dir = "/tmp/greptimedb/logs"
# Datanode export the metrics generated by itself
# encoded to Prometheus remote-write format
# and send to Prometheus remote-write compatible receiver (e.g. send to `greptimedb` itself)
# This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
# [export_metrics]
# whether enable export metrics, default is false
# enable = false
# The interval of export metrics
# write_interval = "30s"
# [export_metrics.remote_write]
# The url the metrics send to. The url is empty by default, url example: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`
# url = ""
# HTTP headers of Prometheus remote-write carry
# headers = {}
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
level = "info"
## Enable OTLP tracing.
enable_otlp_tracing = false
## The OTLP tracing endpoint.
## +toml2docs:none-default
otlp_endpoint = ""
## Whether to append logs to stdout.
append_stdout = true
## The percentage of tracing will be sampled and exported.
## Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
## ratio > 1 are treated as 1. Fractions < 0 are treated as 0
[logging.tracing_sample_ratio]
default_ratio = 1.0
## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
[export_metrics]
## whether enable export metrics.
enable = false
## The interval of export metrics.
write_interval = "30s"
## For `standalone` mode, `self_import` is recommend to collect metrics generated by itself
[export_metrics.self_import]
## +toml2docs:none-default
db = "information_schema"
[export_metrics.remote_write]
## The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`.
url = ""
## HTTP headers of Prometheus remote-write carry.
headers = { }


@@ -1,106 +1,192 @@
# Node running mode, see `standalone.example.toml`.
mode = "distributed"
# The default timezone of the server
# default_timezone = "UTC"
## The running mode of the datanode. It can be `standalone` or `distributed`.
mode = "standalone"
## The default timezone of the server.
## +toml2docs:none-default
default_timezone = "UTC"
## The heartbeat options.
[heartbeat]
# Interval for sending heartbeat task to the Metasrv, 5 seconds by default.
interval = "5s"
# Interval for retry sending heartbeat task, 5 seconds by default.
retry_interval = "5s"
## Interval for sending heartbeat messages to the metasrv.
interval = "18s"
# HTTP server options, see `standalone.example.toml`.
## Interval for retrying to send heartbeat messages to the metasrv.
retry_interval = "3s"
## The HTTP server options.
[http]
## The address to bind the HTTP server.
addr = "127.0.0.1:4000"
## HTTP request timeout.
timeout = "30s"
## HTTP request body limit.
## Support the following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
body_limit = "64MB"
# gRPC server options, see `standalone.example.toml`.
## The gRPC server options.
[grpc]
## The address to bind the gRPC server.
addr = "127.0.0.1:4001"
## The number of server worker threads.
runtime_size = 8
# MySQL server options, see `standalone.example.toml`.
## MySQL server options.
[mysql]
## Whether to enable.
enable = true
## The addr to bind the MySQL server.
addr = "127.0.0.1:4002"
## The number of server worker threads.
runtime_size = 2
# MySQL server TLS options, see `standalone.example.toml`.
# MySQL server TLS options.
[mysql.tls]
## TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html
## - `disable` (default value)
## - `prefer`
## - `require`
## - `verify-ca`
## - `verify-full`
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload
watch = false
# PostgresSQL server options, see `standalone.example.toml`.
## PostgresSQL server options.
[postgres]
## Whether to enable
enable = true
## The addr to bind the PostgresSQL server.
addr = "127.0.0.1:4003"
## The number of server worker threads.
runtime_size = 2
# PostgresSQL server TLS options, see `standalone.example.toml`.
## PostgresSQL server TLS options, see `mysql_options.tls` section.
[postgres.tls]
## TLS mode.
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload
watch = false
# OpenTSDB protocol options, see `standalone.example.toml`.
## OpenTSDB protocol options.
[opentsdb]
## Whether to enable
enable = true
## OpenTSDB telnet API server address.
addr = "127.0.0.1:4242"
## The number of server worker threads.
runtime_size = 2
# InfluxDB protocol options, see `standalone.example.toml`.
## InfluxDB protocol options.
[influxdb]
## Whether to enable InfluxDB protocol in HTTP API.
enable = true
# Prometheus remote storage options, see `standalone.example.toml`.
## Prometheus remote storage options
[prom_store]
## Whether to enable Prometheus remote write and read in HTTP API.
enable = true
# Whether to store the data from Prometheus remote write in metric engine.
# true by default
## Whether to store the data from Prometheus remote write in metric engine.
with_metric_engine = true
# Metasrv client options, see `datanode.example.toml`.
## The metasrv client options.
[meta_client]
## The addresses of the metasrv.
metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout.
timeout = "3s"
# DDL timeouts options.
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout.
ddl_timeout = "10s"
## Connect server timeout.
connect_timeout = "1s"
## `TCP_NODELAY` option for accepted connections.
tcp_nodelay = true
# The configuration about the cache of the Metadata.
# default: 100000
## The configuration about the cache of the metadata.
metadata_cache_max_capacity = 100000
# default: 10m
## TTL of the metadata cache.
metadata_cache_ttl = "10m"
# default: 5m
# TTI of the metadata cache.
metadata_cache_tti = "5m"
# Log options, see `standalone.example.toml`
# [logging]
# dir = "/tmp/greptimedb/logs"
# level = "info"
# Datanode options.
## Datanode options.
[datanode]
# Datanode client options.
## Datanode client options.
[datanode.client]
timeout = "10s"
connect_timeout = "10s"
tcp_nodelay = true
# Frontend export the metrics generated by itself
# encoded to Prometheus remote-write format
# and send to Prometheus remote-write compatible receiver (e.g. send to `greptimedb` itself)
# This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
# [export_metrics]
# whether enable export metrics, default is false
# enable = false
# The interval of export metrics
# write_interval = "30s"
# for `frontend`, `self_import` is recommend to collect metrics generated by itself
# [export_metrics.self_import]
# db = "information_schema"
## The logging options.
[logging]
## The directory to store the log files.
dir = "/tmp/greptimedb/logs"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
level = "info"
## Enable OTLP tracing.
enable_otlp_tracing = false
## The OTLP tracing endpoint.
## +toml2docs:none-default
otlp_endpoint = ""
## Whether to append logs to stdout.
append_stdout = true
## The percentage of tracing will be sampled and exported.
## Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
## ratio > 1 are treated as 1. Fractions < 0 are treated as 0
[logging.tracing_sample_ratio]
default_ratio = 1.0
## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
[export_metrics]
## whether enable export metrics.
enable = false
## The interval of export metrics.
write_interval = "30s"
## For `standalone` mode, `self_import` is recommend to collect metrics generated by itself
[export_metrics.self_import]
## +toml2docs:none-default
db = "information_schema"
[export_metrics.remote_write]
## The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`.
url = ""
## HTTP headers of Prometheus remote-write carry.
headers = { }


@@ -1,39 +1,44 @@
# The working home directory.
## The working home directory.
data_home = "/tmp/metasrv/"
# The bind address of metasrv, "127.0.0.1:3002" by default.
## The bind address of metasrv.
bind_addr = "127.0.0.1:3002"
# The communication server address for frontend and datanode to connect to metasrv, "127.0.0.1:3002" by default for localhost.
## The communication server address for frontend and datanode to connect to metasrv, "127.0.0.1:3002" by default for localhost.
server_addr = "127.0.0.1:3002"
# Etcd server address, "127.0.0.1:2379" by default.
## Etcd server address.
store_addr = "127.0.0.1:2379"
# Datanode selector type.
# - "lease_based" (default value).
# - "load_based"
# For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector".
## Datanode selector type.
## - `lease_based` (default value).
## - `load_based`
## For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector".
selector = "lease_based"
# Store data in memory, false by default.
## Store data in memory.
use_memory_store = false
# Whether to enable greptimedb telemetry, true by default.
## Whether to enable greptimedb telemetry.
enable_telemetry = true
# If it's not empty, the metasrv will store all data with this key prefix.
## If it's not empty, the metasrv will store all data with this key prefix.
store_key_prefix = ""
# Log options, see `standalone.example.toml`
# [logging]
# dir = "/tmp/greptimedb/logs"
# level = "info"
# Procedure storage options.
## Procedure storage options.
[procedure]
# Procedure max retry time.
## Procedure max retry time.
max_retry_times = 12
# Initial retry delay of procedures, increases exponentially
## Initial retry delay of procedures, increases exponentially
retry_delay = "500ms"
# Auto split large value
# GreptimeDB procedure uses etcd as the default metadata storage backend.
# The etcd the maximum size of any request is 1.5 MiB
# 1500KiB = 1536KiB (1.5MiB) - 36KiB (reserved size of key)
# Comments out the `max_metadata_value_size`, for don't split large value (no limit).
## Auto split large value
## GreptimeDB procedure uses etcd as the default metadata storage backend.
## The etcd the maximum size of any request is 1.5 MiB
## 1500KiB = 1536KiB (1.5MiB) - 36KiB (reserved size of key)
## Comments out the `max_metadata_value_size`, for don't split large value (no limit).
max_metadata_value_size = "1500KiB"
# Failure detectors options.
@@ -43,57 +48,96 @@ min_std_deviation = "100ms"
acceptable_heartbeat_pause = "3000ms"
first_heartbeat_estimate = "1000ms"
# # Datanode options.
# [datanode]
# # Datanode client options.
# [datanode.client_options]
# timeout = "10s"
# connect_timeout = "10s"
# tcp_nodelay = true
## Datanode options.
[datanode]
## Datanode client options.
[datanode.client]
timeout = "10s"
connect_timeout = "10s"
tcp_nodelay = true
[wal]
# Available wal providers:
# - "raft_engine" (default)
# - "kafka"
# - `raft_engine` (default): there're none raft-engine wal config since metasrv only involves in remote wal currently.
# - `kafka`: metasrv **have to be** configured with kafka wal config when using kafka wal provider in datanode.
provider = "raft_engine"
# There're none raft-engine wal config since meta srv only involves in remote wal currently.
# Kafka wal config.
# The broker endpoints of the Kafka cluster. ["127.0.0.1:9092"] by default.
# broker_endpoints = ["127.0.0.1:9092"]
# Number of topics to be created upon start.
# num_topics = 64
# Topic selector type.
# Available selector types:
# - "round_robin" (default)
# selector_type = "round_robin"
# A Kafka topic is constructed by concatenating `topic_name_prefix` and `topic_id`.
# topic_name_prefix = "greptimedb_wal_topic"
# Expected number of replicas of each partition.
# replication_factor = 1
# Above which a topic creation operation will be cancelled.
# create_topic_timeout = "30s"
# The initial backoff for kafka clients.
# backoff_init = "500ms"
# The maximum backoff for kafka clients.
# backoff_max = "10s"
# Exponential backoff rate, i.e. next backoff = base * current backoff.
# backoff_base = 2
# Stop reconnecting if the total wait time reaches the deadline. If this config is missing, the reconnecting won't terminate.
# backoff_deadline = "5mins"
# Metasrv export the metrics generated by itself
# encoded to Prometheus remote-write format
# and send to Prometheus remote-write compatible receiver (e.g. send to `greptimedb` itself)
# This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
# [export_metrics]
# whether enable export metrics, default is false
# enable = false
# The interval of export metrics
# write_interval = "30s"
# [export_metrics.remote_write]
# The url the metrics send to. The url is empty by default, url example: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`
# url = ""
# HTTP headers of Prometheus remote-write carry
# headers = {}
## The broker endpoints of the Kafka cluster.
broker_endpoints = ["127.0.0.1:9092"]
## Number of topics to be created upon start.
num_topics = 64
## Topic selector type.
## Available selector types:
## - `round_robin` (default)
selector_type = "round_robin"
## A Kafka topic is constructed by concatenating `topic_name_prefix` and `topic_id`.
topic_name_prefix = "greptimedb_wal_topic"
## Expected number of replicas of each partition.
replication_factor = 1
## Above which a topic creation operation will be cancelled.
create_topic_timeout = "30s"
## The initial backoff for kafka clients.
backoff_init = "500ms"
## The maximum backoff for kafka clients.
backoff_max = "10s"
## Exponential backoff rate, i.e. next backoff = base * current backoff.
backoff_base = 2
## Stop reconnecting if the total wait time reaches the deadline. If this config is missing, the reconnecting won't terminate.
backoff_deadline = "5mins"
## The logging options.
[logging]
## The directory to store the log files.
dir = "/tmp/greptimedb/logs"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
level = "info"
## Enable OTLP tracing.
enable_otlp_tracing = false
## The OTLP tracing endpoint.
## +toml2docs:none-default
otlp_endpoint = ""
## Whether to append logs to stdout.
append_stdout = true
## The percentage of tracing will be sampled and exported.
## Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
## ratio > 1 are treated as 1. Fractions < 0 are treated as 0
[logging.tracing_sample_ratio]
default_ratio = 1.0
## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
[export_metrics]
## whether enable export metrics.
enable = false
## The interval of export metrics.
write_interval = "30s"
## For `standalone` mode, `self_import` is recommend to collect metrics generated by itself
[export_metrics.self_import]
## +toml2docs:none-default
db = "information_schema"
[export_metrics.remote_write]
## The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`.
url = ""
## HTTP headers of Prometheus remote-write carry.
headers = { }


@@ -1,286 +1,477 @@
# Node running mode, "standalone" or "distributed".
## The running mode of the datanode. It can be `standalone` or `distributed`.
mode = "standalone"
# Whether to enable greptimedb telemetry, true by default.
enable_telemetry = true
# The default timezone of the server
# default_timezone = "UTC"
# HTTP server options.
## Enable telemetry to collect anonymous usage data.
enable_telemetry = true
## The default timezone of the server.
## +toml2docs:none-default
default_timezone = "UTC"
## The HTTP server options.
[http]
# Server address, "127.0.0.1:4000" by default.
## The address to bind the HTTP server.
addr = "127.0.0.1:4000"
# HTTP request timeout, 30s by default.
## HTTP request timeout.
timeout = "30s"
# HTTP request body limit, 64Mb by default.
# the following units are supported: B, KB, KiB, MB, MiB, GB, GiB, TB, TiB, PB, PiB
## HTTP request body limit.
## Support the following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
body_limit = "64MB"
# gRPC server options.
## The gRPC server options.
[grpc]
# Server address, "127.0.0.1:4001" by default.
## The address to bind the gRPC server.
addr = "127.0.0.1:4001"
# The number of server worker threads, 8 by default.
## The number of server worker threads.
runtime_size = 8
# MySQL server options.
## MySQL server options.
[mysql]
# Whether to enable
## Whether to enable.
enable = true
# Server address, "127.0.0.1:4002" by default.
## The addr to bind the MySQL server.
addr = "127.0.0.1:4002"
# The number of server worker threads, 2 by default.
## The number of server worker threads.
runtime_size = 2
# MySQL server TLS options.
[mysql.tls]
# TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html
# - "disable" (default value)
# - "prefer"
# - "require"
# - "verify-ca"
# - "verify-full"
## TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html
## - `disable` (default value)
## - `prefer`
## - `require`
## - `verify-ca`
## - `verify-full`
mode = "disable"
# Certificate file path.
## Certificate file path.
## +toml2docs:none-default
cert_path = ""
# Private key file path.
## Private key file path.
## +toml2docs:none-default
key_path = ""
# Watch for Certificate and key file change and auto reload
## Watch for Certificate and key file change and auto reload
watch = false
# PostgresSQL server options.
## PostgresSQL server options.
[postgres]
# Whether to enable
## Whether to enable
enable = true
# Server address, "127.0.0.1:4003" by default.
## The addr to bind the PostgresSQL server.
addr = "127.0.0.1:4003"
# The number of server worker threads, 2 by default.
## The number of server worker threads.
runtime_size = 2
# PostgresSQL server TLS options, see `[mysql_options.tls]` section.
## PostgresSQL server TLS options, see `mysql_options.tls` section.
[postgres.tls]
# TLS mode.
## TLS mode.
mode = "disable"
# certificate file path.
## Certificate file path.
## +toml2docs:none-default
cert_path = ""
# private key file path.
## Private key file path.
## +toml2docs:none-default
key_path = ""
# Watch for Certificate and key file change and auto reload
## Watch for Certificate and key file change and auto reload
watch = false
# OpenTSDB protocol options.
## OpenTSDB protocol options.
[opentsdb]
# Whether to enable
## Whether to enable
enable = true
# OpenTSDB telnet API server address, "127.0.0.1:4242" by default.
## OpenTSDB telnet API server address.
addr = "127.0.0.1:4242"
# The number of server worker threads, 2 by default.
## The number of server worker threads.
runtime_size = 2
# InfluxDB protocol options.
## InfluxDB protocol options.
[influxdb]
# Whether to enable InfluxDB protocol in HTTP API, true by default.
## Whether to enable InfluxDB protocol in HTTP API.
enable = true
# Prometheus remote storage options
## Prometheus remote storage options
[prom_store]
# Whether to enable Prometheus remote write and read in HTTP API, true by default.
## Whether to enable Prometheus remote write and read in HTTP API.
enable = true
# Whether to store the data from Prometheus remote write in metric engine.
# true by default
## Whether to store the data from Prometheus remote write in metric engine.
with_metric_engine = true
## The WAL options.
[wal]
# Available wal providers:
# - "raft_engine" (default)
# - "kafka"
## The provider of the WAL.
## - `raft_engine`: the wal is stored in the local file system by raft-engine.
## - `kafka`: it's remote wal that data is stored in Kafka.
provider = "raft_engine"
# Raft-engine wal options.
# WAL data directory
# dir = "/tmp/greptimedb/wal"
# WAL file size in bytes.
## The directory to store the WAL files.
## **It's only used when the provider is `raft_engine`**.
## +toml2docs:none-default
dir = "/tmp/greptimedb/wal"
## The size of the WAL segment file.
## **It's only used when the provider is `raft_engine`**.
file_size = "256MB"
# WAL purge threshold.
## The threshold of the WAL size to trigger a flush.
## **It's only used when the provider is `raft_engine`**.
purge_threshold = "4GB"
# WAL purge interval in seconds.
## The interval to trigger a flush.
## **It's only used when the provider is `raft_engine`**.
purge_interval = "10m"
# WAL read batch size.
## The read batch size.
## **It's only used when the provider is `raft_engine`**.
read_batch_size = 128
# Whether to sync log file after every write.
## Whether to use sync write.
## **It's only used when the provider is `raft_engine`**.
sync_write = false
# Whether to reuse logically truncated log files.
## Whether to reuse logically truncated log files.
## **It's only used when the provider is `raft_engine`**.
enable_log_recycle = true
# Whether to pre-create log files on start up
## Whether to pre-create log files on start up.
## **It's only used when the provider is `raft_engine`**.
prefill_log_files = false
# Duration for fsyncing log files.
sync_period = "1000ms"
# Kafka wal options.
# The broker endpoints of the Kafka cluster. ["127.0.0.1:9092"] by default.
# broker_endpoints = ["127.0.0.1:9092"]
## Duration for fsyncing log files.
## **It's only used when the provider is `raft_engine`**.
sync_period = "10s"
# Number of topics to be created upon start.
# num_topics = 64
# Topic selector type.
# Available selector types:
# - "round_robin" (default)
# selector_type = "round_robin"
# The prefix of topic name.
# topic_name_prefix = "greptimedb_wal_topic"
# The number of replicas of each partition.
# Warning: the replication factor must be positive and must not be greater than the number of broker endpoints.
# replication_factor = 1
## The Kafka broker endpoints.
## **It's only used when the provider is `kafka`**.
broker_endpoints = ["127.0.0.1:9092"]
# The max size of a single producer batch.
# Warning: Kafka has a default limit of 1MB per message in a topic.
# max_batch_size = "1MB"
# The linger duration.
# linger = "200ms"
# The consumer wait timeout.
# consumer_wait_timeout = "100ms"
# Create topic timeout.
# create_topic_timeout = "30s"
## The max size of a single producer batch.
## Warning: Kafka has a default limit of 1MB per message in a topic.
## **It's only used when the provider is `kafka`**.
max_batch_size = "1MB"
# The initial backoff delay.
# backoff_init = "500ms"
# The maximum backoff delay.
# backoff_max = "10s"
# Exponential backoff rate, i.e. next backoff = base * current backoff.
# backoff_base = 2
# The deadline of retries.
# backoff_deadline = "5mins"
## The linger duration of a kafka batch producer.
## **It's only used when the provider is `kafka`**.
linger = "200ms"
# Metadata storage options.
## The consumer wait timeout.
## **It's only used when the provider is `kafka`**.
consumer_wait_timeout = "100ms"
## The initial backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_init = "500ms"
## The maximum backoff delay.
## **It's only used when the provider is `kafka`**.
backoff_max = "10s"
## The exponential backoff rate, i.e. next backoff = base * current backoff.
## **It's only used when the provider is `kafka`**.
backoff_base = 2
## The deadline of retries.
## **It's only used when the provider is `kafka`**.
backoff_deadline = "5mins"
## Metadata storage options.
[metadata_store]
# Kv file size in bytes.
## Kv file size in bytes.
file_size = "256MB"
# Kv purge threshold.
## Kv purge threshold.
purge_threshold = "4GB"
# Procedure storage options.
## Procedure storage options.
[procedure]
# Procedure max retry time.
## Procedure max retry time.
max_retry_times = 3
# Initial retry delay of procedures, increases exponentially
## Initial retry delay of procedures, increases exponentially
retry_delay = "500ms"
# Storage options.
# Example of using S3 as the storage.
# [storage]
# type = "S3"
# bucket = "greptimedb"
# root = "data"
# access_key_id = "test"
# secret_access_key = "123456"
# endpoint = "https://s3.amazonaws.com"
# region = "us-west-2"
# Example of using Oss as the storage.
# [storage]
# type = "Oss"
# bucket = "greptimedb"
# root = "data"
# access_key_id = "test"
# access_key_secret = "123456"
# endpoint = "https://oss-cn-hangzhou.aliyuncs.com"
# Example of using Azblob as the storage.
# [storage]
# type = "Azblob"
# container = "greptimedb"
# root = "data"
# account_name = "test"
# account_key = "123456"
# endpoint = "https://greptimedb.blob.core.windows.net"
# sas_token = ""
# Example of using Gcs as the storage.
# [storage]
# type = "Gcs"
# bucket = "greptimedb"
# root = "data"
# scope = "test"
# credential_path = "123456"
# endpoint = "https://storage.googleapis.com"
## The data storage options.
[storage]
# The working home directory.
## The working home directory.
data_home = "/tmp/greptimedb/"
# Storage type.
## The storage type used to store the data.
## - `File`: the data is stored in the local file system.
## - `S3`: the data is stored in the S3 object storage.
## - `Gcs`: the data is stored in the Google Cloud Storage.
## - `Azblob`: the data is stored in the Azure Blob Storage.
## - `Oss`: the data is stored in the Aliyun OSS.
type = "File"
# TTL for all tables. Disabled by default.
# global_ttl = "7d"
# Cache configuration for object storage such as 'S3' etc.
# cache_path = "/path/local_cache"
# The local file cache capacity in bytes.
# cache_capacity = "256MB"
## Cache configuration for object storage such as 'S3' etc.
## The local file cache directory.
## +toml2docs:none-default
cache_path = "/path/local_cache"
## The local file cache capacity in bytes.
## +toml2docs:none-default
cache_capacity = "256MB"
## The S3 bucket name.
## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
## +toml2docs:none-default
bucket = "greptimedb"
## The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
## **It's only used when the storage type is `S3`, `Oss` and `Azblob`**.
## +toml2docs:none-default
root = "greptimedb"
## The access key id of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3` and `Oss`**.
## +toml2docs:none-default
access_key_id = "test"
## The secret access key of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3`**.
## +toml2docs:none-default
secret_access_key = "test"
## The secret access key of the aliyun account.
## **It's only used when the storage type is `Oss`**.
## +toml2docs:none-default
access_key_secret = "test"
## The account name of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
account_name = "test"
## The account key of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
account_key = "test"
## The scope of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
scope = "test"
## The credential path of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
credential_path = "test"
## The container of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
container = "greptimedb"
## The sas token of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
sas_token = ""
## The endpoint of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
endpoint = "https://s3.amazonaws.com"
## The region of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
region = "us-west-2"
# Custom storage options
#[[storage.providers]]
#type = "S3"
#[[storage.providers]]
#type = "Gcs"
# [[storage.providers]]
# type = "S3"
# [[storage.providers]]
# type = "Gcs"
# Mito engine options
## The region engine options. You can configure multiple region engines.
[[region_engine]]
## The Mito engine options.
[region_engine.mito]
# Number of region workers
## Number of region workers.
num_workers = 8
# Request channel size of each worker
## Request channel size of each worker.
worker_channel_size = 128
# Max batch size for a worker to handle requests
## Max batch size for a worker to handle requests.
worker_request_batch_size = 64
# Number of meta action updated to trigger a new checkpoint for the manifest
## Number of meta action updated to trigger a new checkpoint for the manifest.
manifest_checkpoint_distance = 10
# Whether to compress manifest and checkpoint file by gzip (default false).
## Whether to compress manifest and checkpoint file by gzip (default false).
compress_manifest = false
# Max number of running background jobs
## Max number of running background jobs
max_background_jobs = 4
# Interval to auto flush a region if it has not flushed yet.
## Interval to auto flush a region if it has not flushed yet.
auto_flush_interval = "1h"
# Global write buffer size for all regions. If not set, it defaults to 1/8 of OS memory with a maximum limit of 1GB.
## Global write buffer size for all regions. If not set, it defaults to 1/8 of OS memory with a maximum limit of 1GB.
global_write_buffer_size = "1GB"
# Global write buffer size threshold to reject write requests. If not set, it defaults to 2 times `global_write_buffer_size`.
## Global write buffer size threshold to reject write requests. If not set, it defaults to 2 times `global_write_buffer_size`.
global_write_buffer_reject_size = "2GB"
# Cache size for SST metadata. Set it to 0 to disable the cache.
# If not set, it defaults to 1/32 of OS memory with a maximum limit of 128MB.
## Cache size for SST metadata. Set it to 0 to disable the cache.
## If not set, it defaults to 1/32 of OS memory with a maximum limit of 128MB.
sst_meta_cache_size = "128MB"
# Cache size for vectors and arrow arrays. Set it to 0 to disable the cache.
# If not set, it defaults to 1/16 of OS memory with a maximum limit of 512MB.
## Cache size for vectors and arrow arrays. Set it to 0 to disable the cache.
## If not set, it defaults to 1/16 of OS memory with a maximum limit of 512MB.
vector_cache_size = "512MB"
# Cache size for pages of SST row groups. Set it to 0 to disable the cache.
# If not set, it defaults to 1/16 of OS memory with a maximum limit of 512MB.
## Cache size for pages of SST row groups. Set it to 0 to disable the cache.
## If not set, it defaults to 1/16 of OS memory with a maximum limit of 512MB.
page_cache_size = "512MB"
# Buffer size for SST writing.
## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
# Parallelism to scan a region (default: 1/4 of CPU cores).
# - 0: use the default value (1/4 of CPU cores).
# - 1: scan in the current thread.
# - n: scan with parallelism n.
## Parallelism to scan a region (default: 1/4 of CPU cores).
## - `0`: use the default value (1/4 of CPU cores).
## - `1`: scan in the current thread.
## - `n`: scan with parallelism n.
scan_parallelism = 0
# Capacity of the channel to send data from parallel scan tasks to the main task (default 32).
## Capacity of the channel to send data from parallel scan tasks to the main task.
parallel_scan_channel_size = 32
# Whether to allow stale WAL entries read during replay.
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false
## The options for inverted index in Mito engine.
[region_engine.mito.inverted_index]
# Whether to create the index on flush.
# - "auto": automatically
# - "disable": never
## Whether to create the index on flush.
## - `auto`: automatically
## - `disable`: never
create_on_flush = "auto"
# Whether to create the index on compaction.
# - "auto": automatically
# - "disable": never
## Whether to create the index on compaction.
## - `auto`: automatically
## - `disable`: never
create_on_compaction = "auto"
# Whether to apply the index on query
# - "auto": automatically
# - "disable": never
## Whether to apply the index on query
## - `auto`: automatically
## - `disable`: never
apply_on_query = "auto"
# Memory threshold for performing an external sort during index creation.
# Setting to empty will disable external sorting, forcing all sorting operations to happen in memory.
## Memory threshold for performing an external sort during index creation.
## Setting to empty will disable external sorting, forcing all sorting operations to happen in memory.
mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
## File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "partition_tree": partition tree memtable
# - "time_series": time-series memtable (deprecated)
type = "partition_tree"
# The max number of keys in one shard.
## Memtable type.
## - `time_series`: time-series memtable
## - `partition_tree`: partition tree memtable (experimental)
type = "time_series"
## The max number of keys in one shard.
## Only available for `partition_tree` memtable.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
## The max rows of data inside the actively writing buffer in one shard.
## Only available for `partition_tree` memtable.
data_freeze_threshold = 32768
# Max dictionary bytes.
## Max dictionary bytes.
## Only available for `partition_tree` memtable.
fork_dictionary_bytes = "1GiB"
# Log options
# [logging]
# Specify logs directory.
# dir = "/tmp/greptimedb/logs"
# Specify the log level [info | debug | error | warn]
# level = "info"
# whether enable tracing, default is false
# enable_otlp_tracing = false
# tracing exporter endpoint with format `ip:port`, we use grpc oltp as exporter, default endpoint is `localhost:4317`
# otlp_endpoint = "localhost:4317"
# Whether to append logs to stdout. Defaults to true.
# append_stdout = true
# The percentage of tracing will be sampled and exported. Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1. ratio > 1 are treated as 1. Fractions < 0 are treated as 0
# [logging.tracing_sample_ratio]
# default_ratio = 0.0
## The logging options.
[logging]
## The directory to store the log files.
dir = "/tmp/greptimedb/logs"
# Standalone export the metrics generated by itself
# encoded to Prometheus remote-write format
# and send to Prometheus remote-write compatible receiver (e.g. send to `greptimedb` itself)
# This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
# [export_metrics]
# whether enable export metrics, default is false
# enable = false
# The interval of export metrics
# write_interval = "30s"
# for `standalone`, `self_import` is recommend to collect metrics generated by itself
# [export_metrics.self_import]
# db = "information_schema"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
level = "info"
## Enable OTLP tracing.
enable_otlp_tracing = false
## The OTLP tracing endpoint.
## +toml2docs:none-default
otlp_endpoint = ""
## Whether to append logs to stdout.
append_stdout = true
## The percentage of tracing that will be sampled and exported.
## Valid range `[0, 1]`: 1 means all traces are sampled, 0 means no traces are sampled; the default value is 1.
## Ratios greater than 1 are treated as 1, and fractions less than 0 are treated as 0.
[logging.tracing_sample_ratio]
default_ratio = 1.0
## The datanode can export its metrics and send them to a Prometheus-compatible service (e.g. `greptimedb` itself) via the remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from Prometheus scrape.
[export_metrics]
## Whether to enable the export of metrics.
enable = false
## The interval of export metrics.
write_interval = "30s"
## For `standalone` mode, `self_import` is recommended to collect metrics generated by itself.
[export_metrics.self_import]
## +toml2docs:none-default
db = "information_schema"
[export_metrics.remote_write]
## The URL to send the metrics to. For example: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`.
url = ""
## The HTTP headers carried by the Prometheus remote-write requests.
headers = { }
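
To make the `[export_metrics]` section above easier to map onto typed options, here is a minimal sketch of deserializing it with serde and the `toml` crate. The struct and field names are assumptions for illustration only, not the actual GreptimeDB option types.

```rust
use std::collections::HashMap;

use serde::Deserialize;

// Hypothetical shapes for the `[export_metrics]` section shown above.
#[derive(Debug, Deserialize)]
struct ExportMetricsConfig {
    enable: bool,
    // Kept as a string here; the real config presumably parses it into a Duration.
    write_interval: String,
    self_import: Option<SelfImportConfig>,
    remote_write: Option<RemoteWriteConfig>,
}

#[derive(Debug, Deserialize)]
struct SelfImportConfig {
    db: String,
}

#[derive(Debug, Deserialize)]
struct RemoteWriteConfig {
    url: String,
    #[serde(default)]
    headers: HashMap<String, String>,
}

fn main() {
    let toml_src = r#"
        enable = false
        write_interval = "30s"

        [self_import]
        db = "information_schema"

        [remote_write]
        url = ""
        headers = {}
    "#;
    let cfg: ExportMetricsConfig = toml::from_str(toml_src).expect("valid TOML");
    println!("{cfg:?}");
}
```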


@@ -27,8 +27,8 @@ subgraph Frontend["Frontend"]
end
end
MyTable --> MetaSrv
MetaSrv --> ETCD
MyTable --> Metasrv
Metasrv --> ETCD
MyTable-->TableEngine0
MyTable-->TableEngine1
@@ -95,8 +95,8 @@ subgraph Frontend["Frontend"]
end
end
MyTable --> MetaSrv
MetaSrv --> ETCD
MyTable --> Metasrv
Metasrv --> ETCD
MyTable-->RegionEngine
MyTable-->RegionEngine1


@@ -36,7 +36,7 @@ Hence, we choose the third option, and use a simple logical plan that's anagonis
## Deploy mode and protocol
- Greptime Flow is an independent streaming compute component. It can be used either within a standalone node or as a dedicated node at the same level as frontend in distributed mode.
- It accepts insert requests as Rows, the format used between frontend and datanode.
- A new flow job is submitted as a modified SQL query, similar to what Snowflake does, e.g. `CREATE TASK avg_over_5m WINDOW_SIZE = "5m" AS SELECT avg(value) FROM table WHERE time > now() - 5m GROUP BY time(1m)`. The flow job is then stored in MetaSrv.
- A new flow job is submitted as a modified SQL query, similar to what Snowflake does, e.g. `CREATE TASK avg_over_5m WINDOW_SIZE = "5m" AS SELECT avg(value) FROM table WHERE time > now() - 5m GROUP BY time(1m)`. The flow job is then stored in Metasrv.
- It also persists results to the frontend in the Rows format.
- The query plan uses Substrait as codec format. It's the same with GreptimeDB's query engine.
- Greptime Flow needs a WAL for recovering. It's possible to reuse datanode's.


@@ -216,7 +216,7 @@ pub enum Error {
},
#[snafu(display("Failed to perform metasrv operation"))]
MetaSrv {
Metasrv {
location: Location,
source: meta_client::error::Error,
},
@@ -304,7 +304,7 @@ impl ErrorExt for Error {
| Error::CreateTable { source, .. }
| Error::TableSchemaMismatch { source, .. } => source.status_code(),
Error::MetaSrv { source, .. } => source.status_code(),
Error::Metasrv { source, .. } => source.status_code(),
Error::SystemCatalogTableScan { source, .. } => source.status_code(),
Error::SystemCatalogTableScanExec { source, .. } => source.status_code(),
Error::InvalidTableInfoInCatalog { source, .. } => source.status_code(),


@@ -20,6 +20,7 @@ mod predicate;
mod region_peers;
mod runtime_metrics;
pub mod schemata;
mod table_constraints;
mod table_names;
pub mod tables;
@@ -52,6 +53,7 @@ use crate::information_schema::partitions::InformationSchemaPartitions;
use crate::information_schema::region_peers::InformationSchemaRegionPeers;
use crate::information_schema::runtime_metrics::InformationSchemaMetrics;
use crate::information_schema::schemata::InformationSchemaSchemata;
use crate::information_schema::table_constraints::InformationSchemaTableConstraints;
use crate::information_schema::tables::InformationSchemaTables;
use crate::CatalogManager;
@@ -173,6 +175,10 @@ impl InformationSchemaProvider {
KEY_COLUMN_USAGE.to_string(),
self.build_table(KEY_COLUMN_USAGE).unwrap(),
);
tables.insert(
TABLE_CONSTRAINTS.to_string(),
self.build_table(TABLE_CONSTRAINTS).unwrap(),
);
// Add memory tables
for name in MEMORY_TABLES.iter() {
@@ -241,6 +247,10 @@ impl InformationSchemaProvider {
self.catalog_name.clone(),
self.catalog_manager.clone(),
)) as _),
TABLE_CONSTRAINTS => Some(Arc::new(InformationSchemaTableConstraints::new(
self.catalog_name.clone(),
self.catalog_manager.clone(),
)) as _),
_ => None,
}
}


@@ -274,8 +274,8 @@ impl InformationSchemaColumnsBuilder {
};
self.add_column(
idx,
&predicates,
idx,
&catalog_name,
&schema_name,
&table.table_info().name,
@@ -292,8 +292,8 @@ impl InformationSchemaColumnsBuilder {
#[allow(clippy::too_many_arguments)]
fn add_column(
&mut self,
index: usize,
predicates: &Predicates,
index: usize,
catalog_name: &str,
schema_name: &str,
table_name: &str,


@@ -49,6 +49,11 @@ pub const COLUMN_NAME: &str = "column_name";
pub const ORDINAL_POSITION: &str = "ordinal_position";
const INIT_CAPACITY: usize = 42;
/// Primary key constraint name
pub(crate) const PRI_CONSTRAINT_NAME: &str = "PRIMARY";
/// Time index constraint name
pub(crate) const TIME_INDEX_CONSTRAINT_NAME: &str = "TIME INDEX";
/// The virtual table implementation for `information_schema.KEY_COLUMN_USAGE`.
pub(super) struct InformationSchemaKeyColumnUsage {
schema: SchemaRef,
@@ -232,7 +237,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
self.add_key_column_usage(
&predicates,
&schema_name,
"TIME INDEX",
TIME_INDEX_CONSTRAINT_NAME,
&catalog_name,
&schema_name,
&table_name,
@@ -262,7 +267,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
self.add_key_column_usage(
&predicates,
&schema_name,
"PRIMARY",
PRI_CONSTRAINT_NAME,
&catalog_name,
&schema_name,
&table_name,


@@ -0,0 +1,286 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::{Arc, Weak};
use arrow_schema::SchemaRef as ArrowSchemaRef;
use common_catalog::consts::INFORMATION_SCHEMA_TABLE_CONSTRAINTS_TABLE_ID;
use common_error::ext::BoxedError;
use common_query::physical_plan::TaskContext;
use common_recordbatch::adapter::RecordBatchStreamAdapter;
use common_recordbatch::{RecordBatch, SendableRecordBatchStream};
use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter;
use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::prelude::{ConcreteDataType, MutableVector};
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::{ConstantVector, StringVector, StringVectorBuilder, VectorRef};
use futures::TryStreamExt;
use snafu::{OptionExt, ResultExt};
use store_api::storage::{ScanRequest, TableId};
use super::{InformationTable, TABLE_CONSTRAINTS};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::key_column_usage::{
PRI_CONSTRAINT_NAME, TIME_INDEX_CONSTRAINT_NAME,
};
use crate::information_schema::Predicates;
use crate::CatalogManager;
/// The `TABLE_CONSTRAINTS` table describes which tables have constraints.
pub(super) struct InformationSchemaTableConstraints {
schema: SchemaRef,
catalog_name: String,
catalog_manager: Weak<dyn CatalogManager>,
}
const CONSTRAINT_CATALOG: &str = "constraint_catalog";
const CONSTRAINT_SCHEMA: &str = "constraint_schema";
const CONSTRAINT_NAME: &str = "constraint_name";
const TABLE_SCHEMA: &str = "table_schema";
const TABLE_NAME: &str = "table_name";
const CONSTRAINT_TYPE: &str = "constraint_type";
const ENFORCED: &str = "enforced";
const INIT_CAPACITY: usize = 42;
const TIME_INDEX_CONSTRAINT_TYPE: &str = "TIME INDEX";
const PRI_KEY_CONSTRAINT_TYPE: &str = "PRIMARY KEY";
impl InformationSchemaTableConstraints {
pub(super) fn new(catalog_name: String, catalog_manager: Weak<dyn CatalogManager>) -> Self {
Self {
schema: Self::schema(),
catalog_name,
catalog_manager,
}
}
fn schema() -> SchemaRef {
Arc::new(Schema::new(vec![
ColumnSchema::new(
CONSTRAINT_CATALOG,
ConcreteDataType::string_datatype(),
false,
),
ColumnSchema::new(
CONSTRAINT_SCHEMA,
ConcreteDataType::string_datatype(),
false,
),
ColumnSchema::new(CONSTRAINT_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_SCHEMA, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(CONSTRAINT_TYPE, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(ENFORCED, ConcreteDataType::string_datatype(), false),
]))
}
fn builder(&self) -> InformationSchemaTableConstraintsBuilder {
InformationSchemaTableConstraintsBuilder::new(
self.schema.clone(),
self.catalog_name.clone(),
self.catalog_manager.clone(),
)
}
}
impl InformationTable for InformationSchemaTableConstraints {
fn table_id(&self) -> TableId {
INFORMATION_SCHEMA_TABLE_CONSTRAINTS_TABLE_ID
}
fn table_name(&self) -> &'static str {
TABLE_CONSTRAINTS
}
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = self.schema.arrow_schema().clone();
let mut builder = self.builder();
let stream = Box::pin(DfRecordBatchStreamAdapter::new(
schema,
futures::stream::once(async move {
builder
.make_table_constraints(Some(request))
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)
}),
));
Ok(Box::pin(
RecordBatchStreamAdapter::try_new(stream)
.map_err(BoxedError::new)
.context(InternalSnafu)?,
))
}
}
struct InformationSchemaTableConstraintsBuilder {
schema: SchemaRef,
catalog_name: String,
catalog_manager: Weak<dyn CatalogManager>,
constraint_schemas: StringVectorBuilder,
constraint_names: StringVectorBuilder,
table_schemas: StringVectorBuilder,
table_names: StringVectorBuilder,
constraint_types: StringVectorBuilder,
}
impl InformationSchemaTableConstraintsBuilder {
fn new(
schema: SchemaRef,
catalog_name: String,
catalog_manager: Weak<dyn CatalogManager>,
) -> Self {
Self {
schema,
catalog_name,
catalog_manager,
constraint_schemas: StringVectorBuilder::with_capacity(INIT_CAPACITY),
constraint_names: StringVectorBuilder::with_capacity(INIT_CAPACITY),
table_schemas: StringVectorBuilder::with_capacity(INIT_CAPACITY),
table_names: StringVectorBuilder::with_capacity(INIT_CAPACITY),
constraint_types: StringVectorBuilder::with_capacity(INIT_CAPACITY),
}
}
/// Construct the `information_schema.table_constraints` virtual table
async fn make_table_constraints(
&mut self,
request: Option<ScanRequest>,
) -> Result<RecordBatch> {
let catalog_name = self.catalog_name.clone();
let catalog_manager = self
.catalog_manager
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name).await;
while let Some(table) = stream.try_next().await? {
let keys = &table.table_info().meta.primary_key_indices;
let schema = table.schema();
if schema.timestamp_index().is_some() {
self.add_table_constraint(
&predicates,
&schema_name,
TIME_INDEX_CONSTRAINT_NAME,
&schema_name,
&table.table_info().name,
TIME_INDEX_CONSTRAINT_TYPE,
);
}
if !keys.is_empty() {
self.add_table_constraint(
&predicates,
&schema_name,
PRI_CONSTRAINT_NAME,
&schema_name,
&table.table_info().name,
PRI_KEY_CONSTRAINT_TYPE,
);
}
}
}
self.finish()
}
fn add_table_constraint(
&mut self,
predicates: &Predicates,
constraint_schema: &str,
constraint_name: &str,
table_schema: &str,
table_name: &str,
constraint_type: &str,
) {
let row = [
(CONSTRAINT_SCHEMA, &Value::from(constraint_schema)),
(CONSTRAINT_NAME, &Value::from(constraint_name)),
(TABLE_SCHEMA, &Value::from(table_schema)),
(TABLE_NAME, &Value::from(table_name)),
(CONSTRAINT_TYPE, &Value::from(constraint_type)),
];
if !predicates.eval(&row) {
return;
}
self.constraint_schemas.push(Some(constraint_schema));
self.constraint_names.push(Some(constraint_name));
self.table_schemas.push(Some(table_schema));
self.table_names.push(Some(table_name));
self.constraint_types.push(Some(constraint_type));
}
fn finish(&mut self) -> Result<RecordBatch> {
let rows_num = self.constraint_names.len();
let constraint_catalogs = Arc::new(ConstantVector::new(
Arc::new(StringVector::from(vec!["def"])),
rows_num,
));
let enforceds = Arc::new(ConstantVector::new(
Arc::new(StringVector::from(vec!["YES"])),
rows_num,
));
let columns: Vec<VectorRef> = vec![
constraint_catalogs,
Arc::new(self.constraint_schemas.finish()),
Arc::new(self.constraint_names.finish()),
Arc::new(self.table_schemas.finish()),
Arc::new(self.table_names.finish()),
Arc::new(self.constraint_types.finish()),
enforceds,
];
RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu)
}
}
impl DfPartitionStream for InformationSchemaTableConstraints {
fn schema(&self) -> &ArrowSchemaRef {
self.schema.arrow_schema()
}
fn execute(&self, _: Arc<TaskContext>) -> DfSendableRecordBatchStream {
let schema = self.schema.arrow_schema().clone();
let mut builder = self.builder();
Box::pin(DfRecordBatchStreamAdapter::new(
schema,
futures::stream::once(async move {
builder
.make_table_constraints(None)
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)
}),
))
}
}
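
To summarize the builder above: `make_table_constraints` emits one TIME INDEX constraint row when the table schema has a timestamp index, plus one PRIMARY KEY row when `primary_key_indices` is non-empty. A toy restatement of that rule (not the actual builder types):

```rust
// Toy model of the row-emission rule in `make_table_constraints`.
fn constraint_types(has_time_index: bool, primary_key_columns: usize) -> Vec<&'static str> {
    let mut types = Vec::new();
    if has_time_index {
        types.push("TIME INDEX");
    }
    if primary_key_columns > 0 {
        types.push("PRIMARY KEY");
    }
    types
}

fn main() {
    // A typical time-series table with a time index and two primary key columns.
    assert_eq!(constraint_types(true, 2), ["TIME INDEX", "PRIMARY KEY"]);
    // A table without primary key columns only reports its time index.
    assert_eq!(constraint_types(true, 0), ["TIME INDEX"]);
}
```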


@@ -41,3 +41,4 @@ pub const SESSION_STATUS: &str = "session_status";
pub const RUNTIME_METRICS: &str = "runtime_metrics";
pub const PARTITIONS: &str = "partitions";
pub const REGION_PEERS: &str = "greptime_region_peers";
pub const TABLE_CONSTRAINTS: &str = "table_constraints";


@@ -37,6 +37,8 @@ use snafu::{ensure, ResultExt};
use crate::error::{ConvertFlightDataSnafu, Error, IllegalFlightMessagesSnafu, ServerSnafu};
use crate::{error, from_grpc_response, metrics, Client, Result, StreamInserter};
pub const DEFAULT_LOOKBACK_STRING: &str = "5m";
#[derive(Clone, Debug, Default)]
pub struct Database {
// The "catalog" and "schema" to be used in processing the requests at the server side.
@@ -215,6 +217,7 @@ impl Database {
start: start.to_string(),
end: end.to_string(),
step: step.to_string(),
lookback: DEFAULT_LOOKBACK_STRING.to_string(),
})),
}))
.await


@@ -76,6 +76,7 @@ tikv-jemallocator = "0.5"
common-test-util.workspace = true
serde.workspace = true
temp-env = "0.3"
tempfile.workspace = true
[target.'cfg(not(windows))'.dev-dependencies]
rexpect = "0.5"


@@ -107,14 +107,11 @@ impl TableMetadataBencher {
.unwrap();
let start = Instant::now();
let table_info = table_info.unwrap();
let table_route = table_route.unwrap();
let table_id = table_info.table_info.ident.table_id;
let _ = self
.table_metadata_manager
.delete_table_metadata(
table_id,
&table_info.table_name(),
table_route.unwrap().region_routes().unwrap(),
)
.delete_table_metadata(table_id, &table_info.table_name(), &table_route)
.await;
start.elapsed()
},
@@ -140,7 +137,7 @@ impl TableMetadataBencher {
let start = Instant::now();
let _ = self
.table_metadata_manager
.rename_table(table_info.unwrap(), new_table_name)
.rename_table(&table_info.unwrap(), new_table_name)
.await;
start.elapsed()


@@ -226,7 +226,10 @@ impl Export {
}
async fn show_create_table(&self, catalog: &str, schema: &str, table: &str) -> Result<String> {
let sql = format!("show create table {}.{}.{}", catalog, schema, table);
let sql = format!(
r#"show create table "{}"."{}"."{}""#,
catalog, schema, table
);
let mut client = self.client.clone();
client.set_catalog(catalog);
client.set_schema(schema);
@@ -273,7 +276,7 @@ impl Export {
for (c, s, t) in table_list {
match self.show_create_table(&c, &s, &t).await {
Err(e) => {
error!(e; "Failed to export table {}.{}.{}", c, s, t)
error!(e; r#"Failed to export table "{}"."{}"."{}""#, c, s, t)
}
Ok(create_table) => {
file.write_all(create_table.as_bytes())
@@ -417,3 +420,84 @@ fn split_database(database: &str) -> Result<(String, Option<String>)> {
Ok((catalog.to_string(), Some(schema.to_string())))
}
}
#[cfg(test)]
mod tests {
use clap::Parser;
use client::{Client, Database};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use crate::error::Result;
use crate::options::{CliOptions, Options};
use crate::{cli, standalone, App};
#[tokio::test(flavor = "multi_thread")]
async fn test_export_create_table_with_quoted_names() -> Result<()> {
let output_dir = tempfile::tempdir().unwrap();
let standalone = standalone::Command::parse_from([
"standalone",
"start",
"--data-home",
&*output_dir.path().to_string_lossy(),
]);
let Options::Standalone(standalone_opts) =
standalone.load_options(&CliOptions::default())?
else {
unreachable!()
};
let mut instance = standalone.build(*standalone_opts).await?;
instance.start().await?;
let client = Client::with_urls(["127.0.0.1:4001"]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
database
.sql(r#"CREATE DATABASE "cli.export.create_table";"#)
.await
.unwrap();
database
.sql(
r#"CREATE TABLE "cli.export.create_table"."a.b.c"(
ts TIMESTAMP,
TIME INDEX (ts)
) engine=mito;
"#,
)
.await
.unwrap();
let output_dir = tempfile::tempdir().unwrap();
let cli = cli::Command::parse_from([
"cli",
"export",
"--addr",
"127.0.0.1:4001",
"--output-dir",
&*output_dir.path().to_string_lossy(),
"--target",
"create-table",
]);
let mut cli_app = cli.build().await?;
cli_app.start().await?;
instance.stop().await?;
let output_file = output_dir
.path()
.join("greptime-cli.export.create_table.sql");
let res = std::fs::read_to_string(output_file).unwrap();
let expect = r#"CREATE TABLE IF NOT EXISTS "a.b.c" (
"ts" TIMESTAMP(3) NOT NULL,
TIME INDEX ("ts")
)
ENGINE=mito
WITH(
regions = 1
);
"#;
assert_eq!(res.trim(), expect.trim());
Ok(())
}
}
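
The test above exercises identifiers that contain dots; the underlying fix is simply to double-quote each name part in the generated `SHOW CREATE TABLE` statement. A standalone sketch of that quoting rule (helper name is illustrative):

```rust
// Without the quotes, a schema named "cli.export.create_table" would be
// split on its dots when the statement is parsed.
fn quoted_show_create(catalog: &str, schema: &str, table: &str) -> String {
    format!(r#"show create table "{}"."{}"."{}""#, catalog, schema, table)
}

fn main() {
    assert_eq!(
        quoted_show_create("greptime", "cli.export.create_table", "a.b.c"),
        r#"show create table "greptime"."cli.export.create_table"."a.b.c""#
    );
}
```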


@@ -17,8 +17,8 @@ use std::time::Duration;
use async_trait::async_trait;
use clap::Parser;
use common_telemetry::logging;
use meta_srv::bootstrap::MetaSrvInstance;
use meta_srv::metasrv::MetaSrvOptions;
use meta_srv::bootstrap::MetasrvInstance;
use meta_srv::metasrv::MetasrvOptions;
use snafu::ResultExt;
use crate::error::{self, Result, StartMetaServerSnafu};
@@ -26,11 +26,11 @@ use crate::options::{CliOptions, Options};
use crate::App;
pub struct Instance {
instance: MetaSrvInstance,
instance: MetasrvInstance,
}
impl Instance {
fn new(instance: MetaSrvInstance) -> Self {
fn new(instance: MetasrvInstance) -> Self {
Self { instance }
}
}
@@ -42,7 +42,7 @@ impl App for Instance {
}
async fn start(&mut self) -> Result<()> {
plugins::start_meta_srv_plugins(self.instance.plugins())
plugins::start_metasrv_plugins(self.instance.plugins())
.await
.context(StartMetaServerSnafu)?;
@@ -64,7 +64,7 @@ pub struct Command {
}
impl Command {
pub async fn build(self, opts: MetaSrvOptions) -> Result<Instance> {
pub async fn build(self, opts: MetasrvOptions) -> Result<Instance> {
self.subcmd.build(opts).await
}
@@ -79,7 +79,7 @@ enum SubCommand {
}
impl SubCommand {
async fn build(self, opts: MetaSrvOptions) -> Result<Instance> {
async fn build(self, opts: MetasrvOptions) -> Result<Instance> {
match self {
SubCommand::Start(cmd) => cmd.build(opts).await,
}
@@ -127,10 +127,10 @@ struct StartCommand {
impl StartCommand {
fn load_options(&self, cli_options: &CliOptions) -> Result<Options> {
let mut opts: MetaSrvOptions = Options::load_layered_options(
let mut opts: MetasrvOptions = Options::load_layered_options(
self.config_file.as_deref(),
self.env_prefix.as_ref(),
MetaSrvOptions::env_list_keys(),
MetasrvOptions::env_list_keys(),
)?;
if let Some(dir) = &cli_options.log_dir {
@@ -193,20 +193,20 @@ impl StartCommand {
Ok(Options::Metasrv(Box::new(opts)))
}
async fn build(self, mut opts: MetaSrvOptions) -> Result<Instance> {
let plugins = plugins::setup_meta_srv_plugins(&mut opts)
async fn build(self, mut opts: MetasrvOptions) -> Result<Instance> {
let plugins = plugins::setup_metasrv_plugins(&mut opts)
.await
.context(StartMetaServerSnafu)?;
logging::info!("MetaSrv start command: {:#?}", self);
logging::info!("MetaSrv options: {:#?}", opts);
logging::info!("Metasrv start command: {:#?}", self);
logging::info!("Metasrv options: {:#?}", opts);
let builder = meta_srv::bootstrap::metasrv_builder(&opts, plugins.clone(), None)
.await
.context(error::BuildMetaServerSnafu)?;
let metasrv = builder.build().await.context(error::BuildMetaServerSnafu)?;
let instance = MetaSrvInstance::new(opts, plugins, metasrv)
let instance = MetasrvInstance::new(opts, plugins, metasrv)
.await
.context(error::BuildMetaServerSnafu)?;


@@ -15,12 +15,12 @@
use clap::ArgMatches;
use common_config::KvBackendConfig;
use common_telemetry::logging::{LoggingOptions, TracingOptions};
use common_wal::config::MetaSrvWalConfig;
use common_wal::config::MetasrvWalConfig;
use config::{Config, Environment, File, FileFormat};
use datanode::config::{DatanodeOptions, ProcedureConfig};
use frontend::error::{Result as FeResult, TomlFormatSnafu};
use frontend::frontend::{FrontendOptions, TomlSerializable};
use meta_srv::metasrv::MetaSrvOptions;
use meta_srv::metasrv::MetasrvOptions;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
@@ -38,7 +38,7 @@ pub struct MixOptions {
pub frontend: FrontendOptions,
pub datanode: DatanodeOptions,
pub logging: LoggingOptions,
pub wal_meta: MetaSrvWalConfig,
pub wal_meta: MetasrvWalConfig,
}
impl From<MixOptions> for FrontendOptions {
@@ -56,7 +56,7 @@ impl TomlSerializable for MixOptions {
pub enum Options {
Datanode(Box<DatanodeOptions>),
Frontend(Box<FrontendOptions>),
Metasrv(Box<MetaSrvOptions>),
Metasrv(Box<MetasrvOptions>),
Standalone(Box<MixOptions>),
Cli(Box<LoggingOptions>),
}


@@ -86,6 +86,8 @@ pub const INFORMATION_SCHEMA_RUNTIME_METRICS_TABLE_ID: u32 = 27;
pub const INFORMATION_SCHEMA_PARTITIONS_TABLE_ID: u32 = 28;
/// id for information_schema.REGION_PEERS
pub const INFORMATION_SCHEMA_REGION_PEERS_TABLE_ID: u32 = 29;
/// id for information_schema.TABLE_CONSTRAINTS
pub const INFORMATION_SCHEMA_TABLE_CONSTRAINTS_TABLE_ID: u32 = 30;
/// ----- End of information_schema tables -----
pub const MITO_ENGINE: &str = "mito";


@@ -60,12 +60,6 @@ impl<
.context(error::BufferedWriterClosedSnafu)?;
let metadata = encoder.close().await?;
// Use `rows_written` to keep a track of if any rows have been written.
// If no row's been written, then we can simply close the underlying
// writer without flush so that no file will be actually created.
if self.rows_written != 0 {
self.bytes_written += self.try_flush(true).await?;
}
// It's important to shut down! flushes all pending writes
self.close_inner_writer().await?;
Ok((metadata, self.bytes_written))
@@ -79,8 +73,15 @@ impl<
Fut: Future<Output = Result<T>>,
> LazyBufferedWriter<T, U, F>
{
/// Closes the writer without flushing the buffer data.
/// Closes the writer and flushes the buffer data.
pub async fn close_inner_writer(&mut self) -> Result<()> {
// Use `rows_written` to keep track of whether any rows have been written.
// If no rows have been written, we can simply close the underlying
// writer without flushing, so that no file is actually created.
if self.rows_written != 0 {
self.bytes_written += self.try_flush(true).await?;
}
if let Some(writer) = &mut self.writer {
writer.shutdown().await.context(error::AsyncWriteSnafu)?;
}
@@ -117,7 +118,7 @@ impl<
Ok(())
}
pub async fn try_flush(&mut self, all: bool) -> Result<u64> {
async fn try_flush(&mut self, all: bool) -> Result<u64> {
let mut bytes_written: u64 = 0;
// Once buffered data size reaches threshold, split the data in chunks (typically 4MB)


@@ -213,10 +213,6 @@ pub async fn stream_to_file<T: DfRecordBatchEncoder, U: Fn(SharedBuffer) -> T>(
writer.write(&batch).await?;
rows += batch.num_rows();
}
// Flushes all pending writes
let _ = writer.try_flush(true).await?;
writer.close_inner_writer().await?;
Ok(rows)
}
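
The explicit `try_flush(true)` call can be dropped here because `close_inner_writer` (see the hunk above) now performs the flush itself, and only when rows were actually written. A simplified sketch of that pattern, with assumed names:

```rust
// Minimal sketch: flush on close only if something was written, so closing an
// empty writer creates no file.
struct SketchWriter {
    rows_written: usize,
    bytes_written: u64,
}

impl SketchWriter {
    fn try_flush(&mut self) -> u64 {
        // Pretend the flush wrote a fixed number of bytes.
        128
    }

    fn close(&mut self) {
        if self.rows_written != 0 {
            let flushed = self.try_flush();
            self.bytes_written += flushed;
        }
        // ...shut down the underlying writer here...
    }
}

fn main() {
    let mut writer = SketchWriter { rows_written: 0, bytes_written: 0 };
    writer.close();
    // Nothing was written, so nothing was flushed.
    assert_eq!(writer.bytes_written, 0);
}
```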


@@ -215,10 +215,7 @@ impl BufferedWriter {
/// Write a record batch to stream writer.
pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> {
self.inner.write(arrow_batch).await?;
self.inner.try_flush(false).await?;
Ok(())
self.inner.write(arrow_batch).await
}
/// Close parquet writer.


@@ -50,11 +50,13 @@ pub trait ClusterInfo {
}
/// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-{cluster_id}-{role}-{node_id}`.
/// This key cannot be used to describe the `Metasrv` because the `Metasrv` does not have
/// a `cluster_id`; it serves multiple clusters.
#[derive(Debug, Clone, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct NodeInfoKey {
/// The cluster id.
pub cluster_id: u64,
/// The role of the node. It can be [Role::Datanode], [Role::Frontend], or [Role::Metasrv].
/// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
pub role: Role,
/// The node id.
pub node_id: u64,
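A quick illustration of the documented key layout (illustrative helper only; the real `NodeInfoKey` serialization lives in the metasrv crate, and `Role` is an enum rather than a plain string):

```rust
// Formats a key following the documented layout
// `__meta_cluster_node_info-{cluster_id}-{role}-{node_id}`.
fn node_info_key(cluster_id: u64, role: &str, node_id: u64) -> String {
    format!("__meta_cluster_node_info-{cluster_id}-{role}-{node_id}")
}

fn main() {
    assert_eq!(
        node_info_key(0, "datanode", 42),
        "__meta_cluster_node_info-0-datanode-42"
    );
}
```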


@@ -35,6 +35,7 @@ use crate::ddl::DdlContext;
use crate::error::{DecodeJsonSnafu, Error, MetadataCorruptionSnafu, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::table_route::PhysicalTableRouteValue;
use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::find_leaders;
@@ -245,10 +246,10 @@ pub struct AlterTablesData {
tasks: Vec<AlterTableTask>,
/// Table info values before the alter operation.
/// Corresponding one-to-one with the AlterTableTask in tasks.
table_info_values: Vec<TableInfoValue>,
table_info_values: Vec<DeserializedValueWithBytes<TableInfoValue>>,
/// Physical table info
physical_table_id: TableId,
physical_table_info: Option<TableInfoValue>,
physical_table_info: Option<DeserializedValueWithBytes<TableInfoValue>>,
physical_table_route: Option<PhysicalTableRouteValue>,
physical_columns: Vec<ColumnMetadata>,
}


@@ -24,6 +24,7 @@ use crate::error::{
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::key::DeserializedValueWithBytes;
use crate::rpc::ddl::AlterTableTask;
impl AlterLogicalTablesProcedure {
@@ -61,11 +62,9 @@ impl AlterLogicalTablesProcedure {
.get_full_table_info(self.data.physical_table_id)
.await?;
let physical_table_info = physical_table_info
.with_context(|| TableInfoNotFoundSnafu {
table: format!("table id - {}", self.data.physical_table_id),
})?
.into_inner();
let physical_table_info = physical_table_info.with_context(|| TableInfoNotFoundSnafu {
table: format!("table id - {}", self.data.physical_table_id),
})?;
let physical_table_route = physical_table_route
.context(TableRouteNotFoundSnafu {
table_id: self.data.physical_table_id,
@@ -99,9 +98,9 @@ impl AlterLogicalTablesProcedure {
async fn get_all_table_info_values(
&self,
table_ids: &[TableId],
) -> Result<Vec<TableInfoValue>> {
) -> Result<Vec<DeserializedValueWithBytes<TableInfoValue>>> {
let table_info_manager = self.context.table_metadata_manager.table_info_manager();
let mut table_info_map = table_info_manager.batch_get(table_ids).await?;
let mut table_info_map = table_info_manager.batch_get_raw(table_ids).await?;
let mut table_info_values = Vec::with_capacity(table_ids.len());
for (table_id, task) in table_ids.iter().zip(self.data.tasks.iter()) {
let table_info_value =


@@ -33,6 +33,7 @@ impl AlterLogicalTablesProcedure {
return Ok(());
}
// Safety: must exist.
let physical_table_info = self.data.physical_table_info.as_ref().unwrap();
// Generates new table info
@@ -45,10 +46,7 @@ impl AlterLogicalTablesProcedure {
// Updates physical table's metadata
self.context
.table_metadata_manager
.update_table_info(
DeserializedValueWithBytes::from_inner(physical_table_info.clone()),
new_raw_table_info,
)
.update_table_info(physical_table_info, new_raw_table_info)
.await?;
Ok(())
@@ -77,7 +75,9 @@ impl AlterLogicalTablesProcedure {
Ok(())
}
pub(crate) fn build_update_metadata(&self) -> Result<Vec<(TableInfoValue, RawTableInfo)>> {
pub(crate) fn build_update_metadata(
&self,
) -> Result<Vec<(DeserializedValueWithBytes<TableInfoValue>, RawTableInfo)>> {
let mut table_info_values_to_update = Vec::with_capacity(self.data.tasks.len());
for (task, table) in self
.data
@@ -94,8 +94,8 @@ impl AlterLogicalTablesProcedure {
fn build_new_table_info(
&self,
task: &AlterTableTask,
table: &TableInfoValue,
) -> Result<(TableInfoValue, RawTableInfo)> {
table: &DeserializedValueWithBytes<TableInfoValue>,
) -> Result<(DeserializedValueWithBytes<TableInfoValue>, RawTableInfo)> {
// Builds new_meta
let table_info = TableInfo::try_from(table.table_info.clone())
.context(error::ConvertRawTableInfoSnafu)?;


@@ -12,52 +12,49 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod check;
mod metadata;
mod region_request;
mod update_metadata;
use std::vec;
use api::v1::alter_expr::Kind;
use api::v1::region::{
alter_request, region_request, AddColumn, AddColumns, AlterRequest, DropColumn, DropColumns,
RegionColumnDef, RegionRequest, RegionRequestHeader,
};
use api::v1::{AlterExpr, RenameTable};
use api::v1::RenameTable;
use async_trait::async_trait;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_grpc_expr::alter_expr_to_request;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{
Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, Status, StringKey,
};
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::{debug, info};
use futures::future;
use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::{ColumnId, RegionId};
use snafu::ResultExt;
use store_api::storage::RegionId;
use strum::AsRefStr;
use table::metadata::{RawTableInfo, TableId, TableInfo};
use table::requests::AlterKind;
use table::metadata::{RawTableInfo, TableId};
use table::table_reference::TableReference;
use crate::cache_invalidator::Context;
use crate::ddl::utils::add_peer_context_if_needed;
use crate::ddl::DdlContext;
use crate::error::{self, ConvertAlterTableRequestSnafu, Error, InvalidProtoMsgSnafu, Result};
use crate::error::{Error, Result};
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::metrics;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders};
use crate::table_name::TableName;
/// The alter table procedure
pub struct AlterTableProcedure {
// The runtime context.
context: DdlContext,
// The serialized data.
data: AlterTableData,
/// proto alter Kind for adding/dropping columns.
kind: Option<alter_request::Kind>,
}
impl AlterTableProcedure {
@@ -65,123 +62,36 @@ impl AlterTableProcedure {
pub fn new(
cluster_id: u64,
table_id: TableId,
task: AlterTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
context: DdlContext,
) -> Result<Self> {
let alter_kind = task
.alter_table
.kind
.as_ref()
.context(InvalidProtoMsgSnafu {
err_msg: "'kind' is absent",
})?;
let (kind, next_column_id) =
create_proto_alter_kind(&table_info_value.table_info, alter_kind)?;
debug!(
"New AlterTableProcedure, kind: {:?}, next_column_id: {:?}",
kind, next_column_id
);
task.validate()?;
Ok(Self {
context,
data: AlterTableData::new(task, table_info_value, cluster_id, next_column_id),
kind,
data: AlterTableData::new(task, table_id, cluster_id),
})
}
pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
let data: AlterTableData = serde_json::from_str(json).context(FromJsonSnafu)?;
let alter_kind = data
.task
.alter_table
.kind
.as_ref()
.context(InvalidProtoMsgSnafu {
err_msg: "'kind' is absent",
})
.map_err(ProcedureError::external)?;
let (kind, next_column_id) =
create_proto_alter_kind(&data.table_info_value.table_info, alter_kind)
.map_err(ProcedureError::external)?;
assert_eq!(data.next_column_id, next_column_id);
Ok(AlterTableProcedure {
context,
data,
kind,
})
Ok(AlterTableProcedure { context, data })
}
// Checks whether the table exists.
async fn on_prepare(&mut self) -> Result<Status> {
let alter_expr = &self.alter_expr();
let catalog = &alter_expr.catalog_name;
let schema = &alter_expr.schema_name;
let alter_kind = self.alter_kind()?;
let manager = &self.context.table_metadata_manager;
if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind {
let new_table_name_key = TableNameKey::new(catalog, schema, new_table_name);
let exist = manager
.table_name_manager()
.exists(new_table_name_key)
.await?;
ensure!(
!exist,
error::TableAlreadyExistsSnafu {
table_name: TableName::from(new_table_name_key).to_string(),
}
)
}
let table_name_key = TableNameKey::new(catalog, schema, &alter_expr.table_name);
let exist = manager.table_name_manager().exists(table_name_key).await?;
ensure!(
exist,
error::TableNotFoundSnafu {
table_name: TableName::from(table_name_key).to_string()
}
);
pub(crate) async fn on_prepare(&mut self) -> Result<Status> {
self.check_alter().await?;
self.fill_table_info().await?;
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
if matches!(alter_kind, Kind::RenameTable { .. }) {
self.data.state = AlterTableState::UpdateMetadata;
} else {
self.data.state = AlterTableState::SubmitAlterRegionRequests;
};
Ok(Status::executing(true))
}
fn alter_expr(&self) -> &AlterExpr {
&self.data.task.alter_table
}
fn alter_kind(&self) -> Result<&Kind> {
self.alter_expr()
.kind
.as_ref()
.context(InvalidProtoMsgSnafu {
err_msg: "'kind' is absent",
})
}
pub fn create_alter_region_request(&self, region_id: RegionId) -> Result<AlterRequest> {
let table_info = self.data.table_info();
Ok(AlterRequest {
region_id: region_id.as_u64(),
schema_version: table_info.ident.version,
kind: self.kind.clone(),
})
}
pub async fn submit_alter_region_requests(&mut self) -> Result<Status> {
let table_id = self.data.table_id();
let (_, physical_table_route) = self
@@ -200,14 +110,7 @@ impl AlterTableProcedure {
for region in regions {
let region_id = RegionId::new(table_id, region);
let request = self.create_alter_region_request(region_id)?;
let request = RegionRequest {
header: Some(RegionRequestHeader {
tracing_context: TracingContext::from_current_span().to_w3c(),
..Default::default()
}),
body: Some(region_request::Body::Alter(request)),
};
let request = self.make_alter_region_request(region_id)?;
debug!("Submitting {request:?} to {datanode}");
let datanode = datanode.clone();
@@ -238,91 +141,39 @@ impl AlterTableProcedure {
Ok(Status::executing(true))
}
/// Update table metadata for rename table operation.
async fn on_update_metadata_for_rename(&self, new_table_name: String) -> Result<()> {
let table_metadata_manager = &self.context.table_metadata_manager;
let current_table_info_value = self.data.table_info_value.clone();
table_metadata_manager
.rename_table(current_table_info_value, new_table_name)
.await?;
Ok(())
}
async fn on_update_metadata_for_alter(&self, new_table_info: RawTableInfo) -> Result<()> {
let table_metadata_manager = &self.context.table_metadata_manager;
let current_table_info_value = self.data.table_info_value.clone();
table_metadata_manager
.update_table_info(current_table_info_value, new_table_info)
.await?;
Ok(())
}
fn build_new_table_info(&self) -> Result<TableInfo> {
// Builds new_meta
let table_info = TableInfo::try_from(self.data.table_info().clone())
.context(error::ConvertRawTableInfoSnafu)?;
let table_ref = self.data.table_ref();
let request = alter_expr_to_request(self.data.table_id(), self.alter_expr().clone())
.context(ConvertAlterTableRequestSnafu)?;
let new_meta = table_info
.meta
.builder_with_alter_kind(table_ref.table, &request.alter_kind, false)
.context(error::TableSnafu)?
.build()
.with_context(|_| error::BuildTableMetaSnafu {
table_name: table_ref.table,
})?;
let mut new_info = table_info.clone();
new_info.meta = new_meta;
new_info.ident.version = table_info.ident.version + 1;
if let Some(column_id) = self.data.next_column_id {
new_info.meta.next_column_id = new_info.meta.next_column_id.max(column_id);
}
if let AlterKind::RenameTable { new_table_name } = &request.alter_kind {
new_info.name = new_table_name.to_string();
}
Ok(new_info)
}
/// Update table metadata.
async fn on_update_metadata(&mut self) -> Result<Status> {
pub(crate) async fn on_update_metadata(&mut self) -> Result<Status> {
let table_id = self.data.table_id();
let table_ref = self.data.table_ref();
let new_info = self.build_new_table_info()?;
// Safety: checked before.
let table_info_value = self.data.table_info_value.as_ref().unwrap();
let new_info = self.build_new_table_info(&table_info_value.table_info)?;
debug!(
"starting update table: {} metadata, new table info {:?}",
"Starting update table: {} metadata, new table info {:?}",
table_ref.to_string(),
new_info
);
if let Kind::RenameTable(RenameTable { new_table_name }) = self.alter_kind()? {
self.on_update_metadata_for_rename(new_table_name.to_string())
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind {
self.on_update_metadata_for_rename(new_table_name.to_string(), table_info_value)
.await?;
} else {
self.on_update_metadata_for_alter(new_info.into()).await?;
self.on_update_metadata_for_alter(new_info.into(), table_info_value)
.await?;
}
info!("Updated table metadata for table {table_ref}, table_id: {table_id}");
self.data.state = AlterTableState::InvalidateTableCache;
Ok(Status::executing(true))
}
/// Broadcasts the invalidating table cache instructions.
async fn on_broadcast(&mut self) -> Result<Status> {
let alter_kind = self.alter_kind()?;
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
let cache_invalidator = &self.context.cache_invalidator;
let cache_keys = if matches!(alter_kind, Kind::RenameTable { .. }) {
vec![CacheIdent::TableName(self.data.table_ref().into())]
@@ -348,7 +199,9 @@ impl AlterTableProcedure {
lock_key.push(SchemaLock::read(table_ref.catalog, table_ref.schema).into());
lock_key.push(TableLock::Write(table_id).into());
if let Ok(Kind::RenameTable(RenameTable { new_table_name })) = self.alter_kind() {
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind {
lock_key.push(
TableNameLock::new(table_ref.catalog, table_ref.schema, new_table_name).into(),
)
@@ -403,8 +256,9 @@ impl Procedure for AlterTableProcedure {
#[derive(Debug, Serialize, Deserialize, AsRefStr)]
enum AlterTableState {
/// Prepares to alter the table
/// Prepares to alter the table.
Prepare,
/// Sends alter region requests to Datanode.
SubmitAlterRegionRequests,
/// Updates table metadata.
UpdateMetadata,
@@ -412,30 +266,25 @@ enum AlterTableState {
InvalidateTableCache,
}
// The serialized data of alter table.
#[derive(Debug, Serialize, Deserialize)]
pub struct AlterTableData {
cluster_id: u64,
state: AlterTableState,
task: AlterTableTask,
table_id: TableId,
/// Table info value before alteration.
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
/// Next column id of the table if the task adds columns to the table.
next_column_id: Option<ColumnId>,
table_info_value: Option<DeserializedValueWithBytes<TableInfoValue>>,
}
impl AlterTableData {
pub fn new(
task: AlterTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
cluster_id: u64,
next_column_id: Option<ColumnId>,
) -> Self {
pub fn new(task: AlterTableTask, table_id: TableId, cluster_id: u64) -> Self {
Self {
state: AlterTableState::Prepare,
task,
table_info_value,
table_id,
cluster_id,
next_column_id,
table_info_value: None,
}
}
@@ -444,76 +293,12 @@ impl AlterTableData {
}
fn table_id(&self) -> TableId {
self.table_info().ident.table_id
self.table_id
}
fn table_info(&self) -> &RawTableInfo {
&self.table_info_value.table_info
}
}
/// Creates region proto alter kind from `table_info` and `alter_kind`.
///
/// Returns the kind and next column id if it adds new columns.
///
/// # Panics
/// Panics if kind is rename.
pub fn create_proto_alter_kind(
table_info: &RawTableInfo,
alter_kind: &Kind,
) -> Result<(Option<alter_request::Kind>, Option<ColumnId>)> {
match alter_kind {
Kind::AddColumns(x) => {
let mut next_column_id = table_info.meta.next_column_id;
let add_columns = x
.add_columns
.iter()
.map(|add_column| {
let column_def =
add_column
.column_def
.as_ref()
.context(InvalidProtoMsgSnafu {
err_msg: "'column_def' is absent",
})?;
let column_id = next_column_id;
next_column_id += 1;
let column_def = RegionColumnDef {
column_def: Some(column_def.clone()),
column_id,
};
Ok(AddColumn {
column_def: Some(column_def),
location: add_column.location.clone(),
})
})
.collect::<Result<Vec<_>>>()?;
Ok((
Some(alter_request::Kind::AddColumns(AddColumns { add_columns })),
Some(next_column_id),
))
}
Kind::DropColumns(x) => {
let drop_columns = x
.drop_columns
.iter()
.map(|x| DropColumn {
name: x.name.clone(),
})
.collect::<Vec<_>>();
Ok((
Some(alter_request::Kind::DropColumns(DropColumns {
drop_columns,
})),
None,
))
}
Kind::RenameTable(_) => Ok((None, None)),
fn table_info(&self) -> Option<&RawTableInfo> {
self.table_info_value
.as_ref()
.map(|value| &value.table_info)
}
}
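
The state machine itself is unchanged in shape; what `on_prepare` above decides is only the next state: rename requests skip the region alteration step. A toy restatement (not the real procedure types):

```rust
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum AlterTableState {
    Prepare,
    SubmitAlterRegionRequests,
    UpdateMetadata,
    InvalidateTableCache,
}

// Mirrors the branch in `on_prepare`: a rename only touches metadata,
// so it jumps straight to `UpdateMetadata`.
fn next_state_after_prepare(is_rename: bool) -> AlterTableState {
    if is_rename {
        AlterTableState::UpdateMetadata
    } else {
        AlterTableState::SubmitAlterRegionRequests
    }
}

fn main() {
    assert_eq!(
        next_state_after_prepare(true),
        AlterTableState::UpdateMetadata
    );
    assert_eq!(
        next_state_after_prepare(false),
        AlterTableState::SubmitAlterRegionRequests
    );
}
```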


@@ -0,0 +1,62 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::alter_expr::Kind;
use api::v1::RenameTable;
use common_catalog::format_full_table_name;
use snafu::ensure;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::error::{self, Result};
use crate::key::table_name::TableNameKey;
impl AlterTableProcedure {
/// Checks:
/// - The new table name doesn't exist (rename).
/// - Table exists.
pub(crate) async fn check_alter(&self) -> Result<()> {
let alter_expr = &self.data.task.alter_table;
let catalog = &alter_expr.catalog_name;
let schema = &alter_expr.schema_name;
let table_name = &alter_expr.table_name;
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
let manager = &self.context.table_metadata_manager;
if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind {
let new_table_name_key = TableNameKey::new(catalog, schema, new_table_name);
let exists = manager
.table_name_manager()
.exists(new_table_name_key)
.await?;
ensure!(
!exists,
error::TableAlreadyExistsSnafu {
table_name: format_full_table_name(catalog, schema, new_table_name),
}
)
}
let table_name_key = TableNameKey::new(catalog, schema, table_name);
let exists = manager.table_name_manager().exists(table_name_key).await?;
ensure!(
exists,
error::TableNotFoundSnafu {
table_name: format_full_table_name(catalog, schema, &alter_expr.table_name),
}
);
Ok(())
}
}


@@ -0,0 +1,42 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_catalog::format_full_table_name;
use snafu::OptionExt;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::error::{self, Result};
impl AlterTableProcedure {
/// Fetches the table info.
pub(crate) async fn fill_table_info(&mut self) -> Result<()> {
let table_id = self.data.table_id();
let alter_expr = &self.data.task.alter_table;
let catalog = &alter_expr.catalog_name;
let schema = &alter_expr.schema_name;
let table_name = &alter_expr.table_name;
let table_info_value = self
.context
.table_metadata_manager
.table_info_manager()
.get(table_id)
.await?
.with_context(|| error::TableNotFoundSnafu {
table_name: format_full_table_name(catalog, schema, table_name),
})?;
self.data.table_info_value = Some(table_info_value);
Ok(())
}
}


@@ -0,0 +1,258 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::alter_expr::Kind;
use api::v1::region::region_request::Body;
use api::v1::region::{
alter_request, AddColumn, AddColumns, AlterRequest, DropColumn, DropColumns, RegionColumnDef,
RegionRequest, RegionRequestHeader,
};
use common_telemetry::tracing_context::TracingContext;
use snafu::OptionExt;
use store_api::storage::RegionId;
use table::metadata::RawTableInfo;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::error::{InvalidProtoMsgSnafu, Result};
impl AlterTableProcedure {
/// Makes alter region request.
pub(crate) fn make_alter_region_request(&self, region_id: RegionId) -> Result<RegionRequest> {
// Safety: Checked in `AlterTableProcedure::new`.
let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap();
// Safety: checked
let table_info = self.data.table_info().unwrap();
let kind = create_proto_alter_kind(table_info, alter_kind)?;
Ok(RegionRequest {
header: Some(RegionRequestHeader {
tracing_context: TracingContext::from_current_span().to_w3c(),
..Default::default()
}),
body: Some(Body::Alter(AlterRequest {
region_id: region_id.as_u64(),
schema_version: table_info.ident.version,
kind,
})),
})
}
}
/// Creates the region proto alter kind from `table_info` and `alter_kind`.
///
/// Returns `None` if the alteration doesn't map to a region request (e.g. a table rename).
fn create_proto_alter_kind(
table_info: &RawTableInfo,
alter_kind: &Kind,
) -> Result<Option<alter_request::Kind>> {
match alter_kind {
Kind::AddColumns(x) => {
let mut next_column_id = table_info.meta.next_column_id;
let add_columns = x
.add_columns
.iter()
.map(|add_column| {
let column_def =
add_column
.column_def
.as_ref()
.context(InvalidProtoMsgSnafu {
err_msg: "'column_def' is absent",
})?;
let column_id = next_column_id;
next_column_id += 1;
let column_def = RegionColumnDef {
column_def: Some(column_def.clone()),
column_id,
};
Ok(AddColumn {
column_def: Some(column_def),
location: add_column.location.clone(),
})
})
.collect::<Result<Vec<_>>>()?;
Ok(Some(alter_request::Kind::AddColumns(AddColumns {
add_columns,
})))
}
Kind::DropColumns(x) => {
let drop_columns = x
.drop_columns
.iter()
.map(|x| DropColumn {
name: x.name.clone(),
})
.collect::<Vec<_>>();
Ok(Some(alter_request::Kind::DropColumns(DropColumns {
drop_columns,
})))
}
Kind::RenameTable(_) => Ok(None),
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::sync::Arc;
use api::v1::add_column_location::LocationType;
use api::v1::alter_expr::Kind;
use api::v1::region::region_request::Body;
use api::v1::region::RegionColumnDef;
use api::v1::{
region, AddColumn, AddColumnLocation, AddColumns, AlterExpr, ColumnDataType,
ColumnDef as PbColumnDef, SemanticType,
};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use store_api::storage::RegionId;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::ddl::test_util::columns::TestColumnDefBuilder;
use crate::ddl::test_util::create_table::{
build_raw_table_info_from_expr, TestCreateTableExprBuilder,
};
use crate::key::table_route::TableRouteValue;
use crate::peer::Peer;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{Region, RegionRoute};
use crate::test_util::{new_ddl_context, MockDatanodeManager};
#[tokio::test]
async fn test_make_alter_region_request() {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_id = 1024;
let region_id = RegionId::new(table_id, 1);
let table_name = "foo";
let create_table = TestCreateTableExprBuilder::default()
.column_defs([
TestColumnDefBuilder::default()
.name("ts")
.data_type(ColumnDataType::TimestampMillisecond)
.semantic_type(SemanticType::Timestamp)
.build()
.unwrap()
.into(),
TestColumnDefBuilder::default()
.name("host")
.data_type(ColumnDataType::String)
.semantic_type(SemanticType::Tag)
.build()
.unwrap()
.into(),
TestColumnDefBuilder::default()
.name("cpu")
.data_type(ColumnDataType::Float64)
.semantic_type(SemanticType::Field)
.build()
.unwrap()
.into(),
])
.table_id(table_id)
.time_index("ts")
.primary_keys(["host".into()])
.table_name(table_name)
.build()
.unwrap()
.into();
let table_info = build_raw_table_info_from_expr(&create_table);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
table_info,
TableRouteValue::physical(vec![RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
}]),
HashMap::new(),
)
.await
.unwrap();
let task = AlterTableTask {
alter_table: AlterExpr {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
schema_name: DEFAULT_SCHEMA_NAME.to_string(),
table_name: table_name.to_string(),
kind: Some(Kind::AddColumns(AddColumns {
add_columns: vec![AddColumn {
column_def: Some(PbColumnDef {
name: "my_tag3".to_string(),
data_type: ColumnDataType::String as i32,
is_nullable: true,
default_constraint: b"hello".to_vec(),
semantic_type: SemanticType::Tag as i32,
comment: String::new(),
..Default::default()
}),
location: Some(AddColumnLocation {
location_type: LocationType::After as i32,
after_column_name: "my_tag2".to_string(),
}),
}],
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context).unwrap();
procedure.on_prepare().await.unwrap();
let Some(Body::Alter(alter_region_request)) =
procedure.make_alter_region_request(region_id).unwrap().body
else {
unreachable!()
};
assert_eq!(alter_region_request.region_id, region_id.as_u64());
assert_eq!(alter_region_request.schema_version, 1);
assert_eq!(
alter_region_request.kind,
Some(region::alter_request::Kind::AddColumns(
region::AddColumns {
add_columns: vec![region::AddColumn {
column_def: Some(RegionColumnDef {
column_def: Some(PbColumnDef {
name: "my_tag3".to_string(),
data_type: ColumnDataType::String as i32,
is_nullable: true,
default_constraint: b"hello".to_vec(),
semantic_type: SemanticType::Tag as i32,
comment: String::new(),
..Default::default()
}),
column_id: 3,
}),
location: Some(AddColumnLocation {
location_type: LocationType::After as i32,
after_column_name: "my_tag2".to_string(),
}),
}]
}
))
);
}
}
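
The `column_id: 3` assertion above follows from the created table already having three columns (`ts`, `host`, `cpu`): `next_column_id` therefore starts at 3 and every added column takes the next consecutive id, exactly as the `AddColumns` arm of `create_proto_alter_kind` does. A dependency-free sketch of that allocation, with the proto structs trimmed down to name/id pairs:

// Assigns consecutive ids to newly added columns, starting from the
// table's current `next_column_id`.
fn assign_column_ids(next_column_id: u32, new_columns: &[&str]) -> Vec<(String, u32)> {
    new_columns
        .iter()
        .enumerate()
        .map(|(offset, name)| (name.to_string(), next_column_id + offset as u32))
        .collect()
}

fn main() {
    // With `ts`, `host` and `cpu` already present, the counter is at 3,
    // so the first added column gets id 3 -- matching the assertion above.
    assert_eq!(
        assign_column_ids(3, &["my_tag3", "my_tag4"]),
        vec![("my_tag3".to_string(), 3), ("my_tag4".to_string(), 4)]
    );
}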

View File

@@ -0,0 +1,87 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_grpc_expr::alter_expr_to_request;
use snafu::ResultExt;
use table::metadata::{RawTableInfo, TableInfo};
use table::requests::AlterKind;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::error::{self, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::DeserializedValueWithBytes;
impl AlterTableProcedure {
/// Builds the new table info by applying the alter kind, bumping the schema version, and advancing `next_column_id` when columns are added.
pub(crate) fn build_new_table_info(&self, table_info: &RawTableInfo) -> Result<TableInfo> {
let table_info =
TableInfo::try_from(table_info.clone()).context(error::ConvertRawTableInfoSnafu)?;
let table_ref = self.data.table_ref();
let alter_expr = self.data.task.alter_table.clone();
let request = alter_expr_to_request(self.data.table_id(), alter_expr)
.context(error::ConvertAlterTableRequestSnafu)?;
let new_meta = table_info
.meta
.builder_with_alter_kind(table_ref.table, &request.alter_kind, false)
.context(error::TableSnafu)?
.build()
.with_context(|_| error::BuildTableMetaSnafu {
table_name: table_ref.table,
})?;
let mut new_info = table_info.clone();
new_info.meta = new_meta;
new_info.ident.version = table_info.ident.version + 1;
match request.alter_kind {
AlterKind::AddColumns { columns } => {
new_info.meta.next_column_id += columns.len() as u32;
}
AlterKind::RenameTable { new_table_name } => {
new_info.name = new_table_name.to_string();
}
AlterKind::DropColumns { .. } => {}
}
Ok(new_info)
}
/// Updates table metadata for rename table operation.
pub(crate) async fn on_update_metadata_for_rename(
&self,
new_table_name: String,
current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
) -> Result<()> {
let table_metadata_manager = &self.context.table_metadata_manager;
table_metadata_manager
.rename_table(current_table_info_value, new_table_name)
.await?;
Ok(())
}
/// Updates table metadata for alter table operation.
pub(crate) async fn on_update_metadata_for_alter(
&self,
new_table_info: RawTableInfo,
current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
) -> Result<()> {
let table_metadata_manager = &self.context.table_metadata_manager;
table_metadata_manager
.update_table_info(current_table_info_value, new_table_info)
.await?;
Ok(())
}
}
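
To summarize the bookkeeping: every successful alter bumps `ident.version` by one; only `AddColumns` advances `next_column_id` (`DropColumns` leaves the counter untouched, so ids are not reused), and a rename just swaps the table name. A stripped-down sketch of that logic on a toy struct (not the real `RawTableInfo`):

struct MiniTableInfo {
    name: String,
    version: u64,
    next_column_id: u32,
}

enum MiniAlterKind {
    AddColumns { count: u32 },
    DropColumns,
    RenameTable { new_name: String },
}

fn apply_alter(mut info: MiniTableInfo, kind: MiniAlterKind) -> MiniTableInfo {
    info.version += 1; // every alter produces a new schema version
    match kind {
        MiniAlterKind::AddColumns { count } => info.next_column_id += count,
        MiniAlterKind::RenameTable { new_name } => info.name = new_name,
        MiniAlterKind::DropColumns => {} // counter untouched: ids are not reused
    }
    info
}

fn main() {
    let info = MiniTableInfo { name: "foo".into(), version: 0, next_column_id: 3 };
    let after = apply_alter(info, MiniAlterKind::AddColumns { count: 1 });
    assert_eq!(after.version, 1);
    assert_eq!(after.next_column_id, 4);
    let renamed = apply_alter(after, MiniAlterKind::RenameTable { new_name: "bar".into() });
    assert_eq!(renamed.name, "bar");
    assert_eq!(renamed.version, 2);
    let dropped = apply_alter(renamed, MiniAlterKind::DropColumns);
    assert_eq!(dropped.next_column_id, 4);
}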

View File

@@ -61,7 +61,7 @@ impl CreateLogicalTablesProcedure {
// Update physical table's metadata
self.context
.table_metadata_manager
.update_table_info(physical_table_info, new_table_info)
.update_table_info(&physical_table_info, new_table_info)
.await?;
// Invalidates the physical table cache

View File

@@ -76,6 +76,7 @@ impl DropDatabaseCursor {
.await?;
Ok((
Box::new(DropDatabaseExecutor::new(
table_id,
table_id,
TableName::new(&ctx.catalog, &ctx.schema, &table_name),
table_route.region_routes,
@@ -86,6 +87,7 @@ impl DropDatabaseCursor {
}
(DropTableTarget::Physical, TableRouteValue::Physical(table_route)) => Ok((
Box::new(DropDatabaseExecutor::new(
table_id,
table_id,
TableName::new(&ctx.catalog, &ctx.schema, &table_name),
table_route.region_routes,
@@ -220,7 +222,7 @@ mod tests {
.get_physical_table_route(physical_table_id)
.await
.unwrap();
assert_eq!(table_route.region_routes, executor.region_routes);
assert_eq!(table_route.region_routes, executor.physical_region_routes);
assert_eq!(executor.target, DropTableTarget::Logical);
}

View File

@@ -26,6 +26,7 @@ use crate::ddl::drop_database::State;
use crate::ddl::drop_table::executor::DropTableExecutor;
use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::key::table_route::TableRouteValue;
use crate::region_keeper::OperatingRegionGuard;
use crate::rpc::router::{operating_leader_regions, RegionRoute};
use crate::table_name::TableName;
@@ -33,8 +34,10 @@ use crate::table_name::TableName;
#[derive(Debug, Serialize, Deserialize)]
pub(crate) struct DropDatabaseExecutor {
table_id: TableId,
physical_table_id: TableId,
table_name: TableName,
pub(crate) region_routes: Vec<RegionRoute>,
/// The physical table region routes.
pub(crate) physical_region_routes: Vec<RegionRoute>,
pub(crate) target: DropTableTarget,
#[serde(skip)]
dropping_regions: Vec<OperatingRegionGuard>,
@@ -44,14 +47,16 @@ impl DropDatabaseExecutor {
/// Returns a new [DropDatabaseExecutor].
pub fn new(
table_id: TableId,
physical_table_id: TableId,
table_name: TableName,
region_routes: Vec<RegionRoute>,
physical_region_routes: Vec<RegionRoute>,
target: DropTableTarget,
) -> Self {
Self {
table_name,
table_id,
region_routes,
physical_table_id,
table_name,
physical_region_routes,
target,
dropping_regions: vec![],
}
@@ -60,7 +65,7 @@ impl DropDatabaseExecutor {
impl DropDatabaseExecutor {
fn register_dropping_regions(&mut self, ddl_ctx: &DdlContext) -> Result<()> {
let dropping_regions = operating_leader_regions(&self.region_routes);
let dropping_regions = operating_leader_regions(&self.physical_region_routes);
let mut dropping_region_guards = Vec::with_capacity(dropping_regions.len());
for (region_id, datanode_id) in dropping_regions {
let guard = ddl_ctx
@@ -87,12 +92,18 @@ impl State for DropDatabaseExecutor {
) -> Result<(Box<dyn State>, Status)> {
self.register_dropping_regions(ddl_ctx)?;
let executor = DropTableExecutor::new(self.table_name.clone(), self.table_id, true);
// Deletes metadata for table permanently.
let table_route_value = TableRouteValue::new(
self.table_id,
self.physical_table_id,
self.physical_region_routes.clone(),
);
executor
.on_remove_metadata(ddl_ctx, &self.region_routes)
.on_destroy_metadata(ddl_ctx, &table_route_value)
.await?;
executor.invalidate_table_cache(ddl_ctx).await?;
executor
.on_drop_regions(ddl_ctx, &self.region_routes)
.on_drop_regions(ddl_ctx, &self.physical_region_routes)
.await?;
info!("Table: {}({}) is dropped", self.table_name, self.table_id);
@@ -122,7 +133,9 @@ mod tests {
use crate::ddl::drop_database::{DropDatabaseContext, DropTableTarget, State};
use crate::ddl::test_util::{create_logical_table, create_physical_table};
use crate::error::{self, Error, Result};
use crate::key::datanode_table::DatanodeTableKey;
use crate::peer::Peer;
use crate::rpc::router::region_distribution;
use crate::table_name::TableName;
use crate::test_util::{new_ddl_context, MockDatanodeHandler, MockDatanodeManager};
@@ -157,6 +170,7 @@ mod tests {
.unwrap();
{
let mut state = DropDatabaseExecutor::new(
physical_table_id,
physical_table_id,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy"),
table_route.region_routes.clone(),
@@ -181,9 +195,10 @@ mod tests {
tables: None,
};
let mut state = DropDatabaseExecutor::new(
physical_table_id,
physical_table_id,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy"),
table_route.region_routes,
table_route.region_routes.clone(),
DropTableTarget::Physical,
);
let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap();
@@ -207,6 +222,7 @@ mod tests {
.unwrap();
{
let mut state = DropDatabaseExecutor::new(
logical_table_id,
physical_table_id,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "metric"),
table_route.region_routes.clone(),
@@ -231,8 +247,9 @@ mod tests {
tables: None,
};
let mut state = DropDatabaseExecutor::new(
logical_table_id,
physical_table_id,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy"),
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "metric"),
table_route.region_routes,
DropTableTarget::Logical,
);
@@ -240,6 +257,33 @@ mod tests {
assert!(!status.need_persist());
let cursor = state.as_any().downcast_ref::<DropDatabaseCursor>().unwrap();
assert_eq!(cursor.target, DropTableTarget::Logical);
// Checks table info
ddl_context
.table_metadata_manager
.table_info_manager()
.get(physical_table_id)
.await
.unwrap()
.unwrap();
// Checks table route
let table_route = ddl_context
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get(physical_table_id)
.await
.unwrap()
.unwrap();
let region_routes = table_route.region_routes().unwrap();
for datanode_id in region_distribution(region_routes).into_keys() {
ddl_context
.table_metadata_manager
.datanode_table_manager()
.get(&DatanodeTableKey::new(datanode_id, physical_table_id))
.await
.unwrap()
.unwrap();
}
}
#[derive(Clone)]
@@ -279,6 +323,7 @@ mod tests {
.await
.unwrap();
let mut state = DropDatabaseExecutor::new(
physical_table_id,
physical_table_id,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy"),
table_route.region_routes,

View File

@@ -12,27 +12,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod executor;
pub(crate) mod executor;
mod metadata;
use async_trait::async_trait;
use common_procedure::error::{FromJsonSnafu, ToJsonSnafu};
use common_procedure::{
Context as ProcedureContext, LockKey, Procedure, Result as ProcedureResult, Status,
Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure,
Result as ProcedureResult, Status,
};
use common_telemetry::info;
use common_telemetry::tracing::warn;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use strum::AsRefStr;
use table::metadata::{RawTableInfo, TableId};
use table::metadata::TableId;
use table::table_reference::TableReference;
use self::executor::DropTableExecutor;
use crate::ddl::utils::handle_retry_error;
use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::table_route::TableRouteValue;
use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::region_keeper::OperatingRegionGuard;
@@ -46,50 +47,50 @@ pub struct DropTableProcedure {
pub data: DropTableData,
/// The guards of opening regions.
pub dropping_regions: Vec<OperatingRegionGuard>,
/// The drop table executor.
executor: DropTableExecutor,
}
impl DropTableProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::DropTable";
pub fn new(
cluster_id: u64,
task: DropTableTask,
table_route_value: DeserializedValueWithBytes<TableRouteValue>,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
context: DdlContext,
) -> Self {
pub fn new(cluster_id: u64, task: DropTableTask, context: DdlContext) -> Self {
let data = DropTableData::new(cluster_id, task);
let executor = data.build_executor();
Self {
context,
data: DropTableData::new(cluster_id, task, table_route_value, table_info_value),
data,
dropping_regions: vec![],
executor,
}
}
pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
let data = serde_json::from_str(json).context(FromJsonSnafu)?;
let data: DropTableData = serde_json::from_str(json).context(FromJsonSnafu)?;
let executor = data.build_executor();
Ok(Self {
context,
data,
dropping_regions: vec![],
executor,
})
}
async fn on_prepare<'a>(&mut self, executor: &DropTableExecutor) -> Result<Status> {
if executor.on_prepare(&self.context).await?.stop() {
pub(crate) async fn on_prepare<'a>(&mut self) -> Result<Status> {
if self.executor.on_prepare(&self.context).await?.stop() {
return Ok(Status::done());
}
self.data.state = DropTableState::RemoveMetadata;
self.fill_table_metadata().await?;
self.data.state = DropTableState::DeleteMetadata;
Ok(Status::executing(true))
}
/// Registers the dropping regions if they haven't been registered yet.
fn register_dropping_regions(&mut self) -> Result<()> {
let region_routes = self.data.region_routes()?;
let dropping_regions = operating_leader_regions(&self.data.physical_region_routes);
let dropping_regions = operating_leader_regions(region_routes);
if self.dropping_regions.len() == dropping_regions.len() {
if !self.dropping_regions.is_empty() {
return Ok(());
}
@@ -112,7 +113,7 @@ impl DropTableProcedure {
}
/// Removes the table metadata.
async fn on_remove_metadata(&mut self, executor: &DropTableExecutor) -> Result<Status> {
pub(crate) async fn on_delete_metadata(&mut self) -> Result<Status> {
self.register_dropping_regions()?;
// NOTE: If the meta server crashes after `DeleteMetadata`,
// the corresponding regions of this table on the Datanode will be closed automatically.
@@ -120,8 +121,15 @@ impl DropTableProcedure {
// TODO(weny): Considers introducing a RegionStatus to indicate the region is dropping.
let table_id = self.data.table_id();
executor
.on_remove_metadata(&self.context, self.data.region_routes()?)
let table_route_value = &TableRouteValue::new(
self.data.task.table_id,
// Safety: checked
self.data.physical_table_id.unwrap(),
self.data.physical_region_routes.clone(),
);
// Deletes table metadata logically.
self.executor
.on_delete_metadata(&self.context, table_route_value)
.await?;
info!("Deleted table metadata for table {table_id}");
self.data.state = DropTableState::InvalidateTableCache;
@@ -129,16 +137,31 @@ impl DropTableProcedure {
}
/// Broadcasts invalidate table cache instruction.
async fn on_broadcast(&mut self, executor: &DropTableExecutor) -> Result<Status> {
executor.invalidate_table_cache(&self.context).await?;
async fn on_broadcast(&mut self) -> Result<Status> {
self.executor.invalidate_table_cache(&self.context).await?;
self.data.state = DropTableState::DatanodeDropRegions;
Ok(Status::executing(true))
}
pub async fn on_datanode_drop_regions(&self, executor: &DropTableExecutor) -> Result<Status> {
executor
.on_drop_regions(&self.context, self.data.region_routes()?)
pub async fn on_datanode_drop_regions(&mut self) -> Result<Status> {
self.executor
.on_drop_regions(&self.context, &self.data.physical_region_routes)
.await?;
self.data.state = DropTableState::DeleteTombstone;
Ok(Status::executing(true))
}
/// Deletes metadata tombstone.
async fn on_delete_metadata_tombstone(&self) -> Result<Status> {
let table_route_value = &TableRouteValue::new(
self.data.task.table_id,
// Safety: checked
self.data.physical_table_id.unwrap(),
self.data.physical_region_routes.clone(),
);
self.executor
.on_delete_metadata_tombstone(&self.context, table_route_value)
.await?;
Ok(Status::done())
}
@@ -151,21 +174,17 @@ impl Procedure for DropTableProcedure {
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
let executor = DropTableExecutor::new(
self.data.task.table_name(),
self.data.table_id(),
self.data.task.drop_if_exists,
);
let state = &self.data.state;
let _timer = metrics::METRIC_META_PROCEDURE_DROP_TABLE
.with_label_values(&[state.as_ref()])
.start_timer();
match self.data.state {
DropTableState::Prepare => self.on_prepare(&executor).await,
DropTableState::RemoveMetadata => self.on_remove_metadata(&executor).await,
DropTableState::InvalidateTableCache => self.on_broadcast(&executor).await,
DropTableState::DatanodeDropRegions => self.on_datanode_drop_regions(&executor).await,
DropTableState::Prepare => self.on_prepare().await,
DropTableState::DeleteMetadata => self.on_delete_metadata().await,
DropTableState::InvalidateTableCache => self.on_broadcast().await,
DropTableState::DatanodeDropRegions => self.on_datanode_drop_regions().await,
DropTableState::DeleteTombstone => self.on_delete_metadata_tombstone().await,
}
.map_err(handle_retry_error)
}
@@ -185,31 +204,47 @@ impl Procedure for DropTableProcedure {
LockKey::new(lock_key)
}
fn rollback_supported(&self) -> bool {
!matches!(self.data.state, DropTableState::Prepare)
}
async fn rollback(&mut self, _: &ProcedureContext) -> ProcedureResult<()> {
warn!(
"Rolling back the drop table procedure, table: {}",
self.data.table_id()
);
let table_route_value = &TableRouteValue::new(
self.data.task.table_id,
// Safety: checked
self.data.physical_table_id.unwrap(),
self.data.physical_region_routes.clone(),
);
self.executor
.on_restore_metadata(&self.context, table_route_value)
.await
.map_err(ProcedureError::external)
}
}
#[derive(Debug, Serialize, Deserialize)]
/// TODO(weny): simplify the table data.
pub struct DropTableData {
pub state: DropTableState,
pub cluster_id: u64,
pub task: DropTableTask,
pub table_route_value: DeserializedValueWithBytes<TableRouteValue>,
pub table_info_value: DeserializedValueWithBytes<TableInfoValue>,
pub physical_region_routes: Vec<RegionRoute>,
pub physical_table_id: Option<TableId>,
}
impl DropTableData {
pub fn new(
cluster_id: u64,
task: DropTableTask,
table_route_value: DeserializedValueWithBytes<TableRouteValue>,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
) -> Self {
pub fn new(cluster_id: u64, task: DropTableTask) -> Self {
Self {
state: DropTableState::Prepare,
cluster_id,
task,
table_info_value,
table_route_value,
physical_region_routes: vec![],
physical_table_id: None,
}
}
@@ -217,27 +252,30 @@ impl DropTableData {
self.task.table_ref()
}
fn region_routes(&self) -> Result<&Vec<RegionRoute>> {
self.table_route_value.region_routes()
}
fn table_info(&self) -> &RawTableInfo {
&self.table_info_value.table_info
}
fn table_id(&self) -> TableId {
self.table_info().ident.table_id
self.task.table_id
}
fn build_executor(&self) -> DropTableExecutor {
DropTableExecutor::new(
self.task.table_name(),
self.task.table_id,
self.task.drop_if_exists,
)
}
}
/// The state of drop table.
#[derive(Debug, Serialize, Deserialize, AsRefStr)]
pub enum DropTableState {
/// Prepares to drop the table
Prepare,
/// Removes metadata
RemoveMetadata,
/// Deletes metadata logically
DeleteMetadata,
/// Invalidates Table Cache
InvalidateTableCache,
/// Drops regions on Datanode
DatanodeDropRegions,
/// Deletes metadata tombstone permanently
DeleteTombstone,
}
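
Taken together, the renamed states give the procedure a five-state lifecycle, and the metadata tombstone is only removed after the Datanode regions are gone. A standalone sketch of the transition order implied by the handlers above (ignoring the early `Status::done()` exit that `on_prepare` takes for `drop_if_exists` on a missing table):

#[derive(Debug, PartialEq)]
enum DropTableState {
    Prepare,
    DeleteMetadata,
    InvalidateTableCache,
    DatanodeDropRegions,
    DeleteTombstone,
}

// The next state each handler moves to; `None` once the tombstone is
// deleted (`on_delete_metadata_tombstone` returns `Status::done()`).
fn next_state(state: &DropTableState) -> Option<DropTableState> {
    use DropTableState::*;
    match state {
        Prepare => Some(DeleteMetadata),
        DeleteMetadata => Some(InvalidateTableCache),
        InvalidateTableCache => Some(DatanodeDropRegions),
        DatanodeDropRegions => Some(DeleteTombstone),
        DeleteTombstone => None,
    }
}

fn main() {
    let mut state = DropTableState::Prepare;
    let mut transitions = 0;
    while let Some(next) = next_state(&state) {
        state = next;
        transitions += 1;
    }
    assert_eq!(transitions, 4);
    assert_eq!(state, DropTableState::DeleteTombstone);
}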

View File

@@ -30,6 +30,7 @@ use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::instruction::CacheIdent;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::rpc::router::{find_leader_regions, find_leaders, RegionRoute};
use crate::table_name::TableName;
@@ -99,14 +100,73 @@ impl DropTableExecutor {
Ok(Control::Continue(()))
}
/// Removes the table metadata.
pub async fn on_remove_metadata(
/// Deletes the table metadata **logically**.
pub async fn on_delete_metadata(
&self,
ctx: &DdlContext,
region_routes: &[RegionRoute],
table_route_value: &TableRouteValue,
) -> Result<()> {
let table_name_key = TableNameKey::new(
&self.table.catalog_name,
&self.table.schema_name,
&self.table.table_name,
);
if !ctx
.table_metadata_manager
.table_name_manager()
.exists(table_name_key)
.await?
{
return Ok(());
}
ctx.table_metadata_manager
.delete_table_metadata(self.table_id, &self.table, table_route_value)
.await
}
/// Deletes the table metadata tombstone **permanently**.
pub async fn on_delete_metadata_tombstone(
&self,
ctx: &DdlContext,
table_route_value: &TableRouteValue,
) -> Result<()> {
ctx.table_metadata_manager
.delete_table_metadata(self.table_id, &self.table, region_routes)
.delete_table_metadata_tombstone(self.table_id, &self.table, table_route_value)
.await
}
/// Deletes metadata for table **permanently**.
pub async fn on_destroy_metadata(
&self,
ctx: &DdlContext,
table_route_value: &TableRouteValue,
) -> Result<()> {
ctx.table_metadata_manager
.destroy_table_metadata(self.table_id, &self.table, table_route_value)
.await
}
/// Restores the table metadata.
pub async fn on_restore_metadata(
&self,
ctx: &DdlContext,
table_route_value: &TableRouteValue,
) -> Result<()> {
let table_name_key = TableNameKey::new(
&self.table.catalog_name,
&self.table.schema_name,
&self.table.table_name,
);
if ctx
.table_metadata_manager
.table_name_manager()
.exists(table_name_key)
.await?
{
return Ok(());
}
ctx.table_metadata_manager
.restore_table_metadata(self.table_id, &self.table, table_route_value)
.await
}
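
The executor now exposes four metadata operations: a logical delete that keeps the data recoverable, a permanent delete of the tombstone once the regions are gone, a destroy that removes everything outright (used by the drop-database path), and a restore used during rollback. As a rough mental model only, assuming the tombstone behaves like a separate key space the values are moved into (this is an illustration, not the actual `TableMetadataManager` implementation):

use std::collections::HashMap;

// Toy key-value view of tombstoned metadata; keys stand in for the
// table info / table name / table route / datanode table keys.
#[derive(Default)]
struct ToyMeta {
    live: HashMap<String, Vec<u8>>,
    tombstone: HashMap<String, Vec<u8>>,
}

impl ToyMeta {
    // Logical delete: move the live value into the tombstone area.
    fn delete(&mut self, key: &str) {
        if let Some(v) = self.live.remove(key) {
            self.tombstone.insert(key.to_string(), v);
        }
    }
    // Permanent delete of the tombstone, once the regions are dropped.
    fn delete_tombstone(&mut self, key: &str) {
        self.tombstone.remove(key);
    }
    // Destroy: remove both the live and the tombstoned copy outright.
    fn destroy(&mut self, key: &str) {
        self.live.remove(key);
        self.tombstone.remove(key);
    }
    // Restore (rollback): move the tombstoned value back to live.
    fn restore(&mut self, key: &str) {
        if let Some(v) = self.tombstone.remove(key) {
            self.live.insert(key.to_string(), v);
        }
    }
}

fn main() {
    let mut meta = ToyMeta::default();
    meta.live.insert("table_info/1024".into(), b"v1".to_vec());
    meta.delete("table_info/1024"); // logical delete, still recoverable
    meta.restore("table_info/1024"); // rollback brings it back
    assert!(meta.live.contains_key("table_info/1024"));
    meta.delete("table_info/1024");
    meta.delete_tombstone("table_info/1024"); // permanent, after regions are dropped
    meta.live.insert("table_route/1024".into(), b"v2".to_vec());
    meta.destroy("table_route/1024"); // drop-database path, nothing kept
    assert!(meta.live.is_empty() && meta.tombstone.is_empty());
}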

View File

@@ -0,0 +1,34 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::ddl::drop_table::DropTableProcedure;
use crate::error::Result;
impl DropTableProcedure {
/// Fetches the table info and physical table route.
pub(crate) async fn fill_table_metadata(&mut self) -> Result<()> {
let task = &self.data.task;
let (physical_table_id, physical_table_route_value) = self
.context
.table_metadata_manager
.table_route_manager()
.get_physical_table_route(task.table_id)
.await?;
self.data.physical_region_routes = physical_table_route_value.region_routes;
self.data.physical_table_id = Some(physical_table_id);
Ok(())
}
}

View File

@@ -15,6 +15,7 @@
pub mod alter_table;
pub mod columns;
pub mod create_table;
pub mod datanode_handler;
use std::collections::HashMap;

View File

@@ -28,7 +28,7 @@ pub struct TestAlterTableExpr {
table_name: String,
#[builder(setter(into))]
add_columns: Vec<ColumnDef>,
#[builder(setter(into))]
#[builder(setter(into, strip_option))]
new_table_name: Option<String>,
}

View File

@@ -15,7 +15,8 @@
use std::collections::HashMap;
use api::v1::column_def::try_as_column_schema;
use api::v1::{ColumnDef, CreateTableExpr, SemanticType};
use api::v1::meta::Partition;
use api::v1::{ColumnDataType, ColumnDef, CreateTableExpr, SemanticType};
use chrono::DateTime;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, MITO2_ENGINE};
use datatypes::schema::RawSchema;
@@ -24,6 +25,9 @@ use store_api::storage::TableId;
use table::metadata::{RawTableInfo, RawTableMeta, TableIdent, TableType};
use table::requests::TableOptions;
use crate::ddl::test_util::columns::TestColumnDefBuilder;
use crate::rpc::ddl::CreateTableTask;
#[derive(Default, Builder)]
#[builder(default)]
pub struct TestCreateTableExpr {
@@ -43,6 +47,7 @@ pub struct TestCreateTableExpr {
primary_keys: Vec<String>,
create_if_not_exists: bool,
table_options: HashMap<String, String>,
#[builder(setter(into, strip_option))]
table_id: Option<TableId>,
#[builder(setter(into), default = "MITO2_ENGINE.to_string()")]
engine: String,
@@ -129,3 +134,47 @@ pub fn build_raw_table_info_from_expr(expr: &CreateTableExpr) -> RawTableInfo {
table_type: TableType::Base,
}
}
pub fn test_create_table_task(name: &str, table_id: TableId) -> CreateTableTask {
let create_table = TestCreateTableExprBuilder::default()
.column_defs([
TestColumnDefBuilder::default()
.name("ts")
.data_type(ColumnDataType::TimestampMillisecond)
.semantic_type(SemanticType::Timestamp)
.build()
.unwrap()
.into(),
TestColumnDefBuilder::default()
.name("host")
.data_type(ColumnDataType::String)
.semantic_type(SemanticType::Tag)
.build()
.unwrap()
.into(),
TestColumnDefBuilder::default()
.name("cpu")
.data_type(ColumnDataType::Float64)
.semantic_type(SemanticType::Field)
.build()
.unwrap()
.into(),
])
.table_id(table_id)
.time_index("ts")
.primary_keys(["host".into()])
.table_name(name)
.build()
.unwrap()
.into();
let table_info = build_raw_table_info_from_expr(&create_table);
CreateTableTask {
create_table,
// Single region
partitions: vec![Partition {
column_list: vec![],
value_list: vec![],
}],
table_info,
}
}

View File

@@ -0,0 +1,169 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::region::{QueryRequest, RegionRequest};
use common_error::ext::{BoxedError, ErrorExt, StackError};
use common_error::status_code::StatusCode;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::debug;
use snafu::{ResultExt, Snafu};
use tokio::sync::mpsc;
use crate::datanode_manager::HandleResponse;
use crate::error::{self, Error, Result};
use crate::peer::Peer;
use crate::test_util::MockDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for () {
async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> Result<HandleResponse> {
unreachable!()
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[derive(Clone)]
pub struct DatanodeWatcher(pub mpsc::Sender<(Peer, RegionRequest)>);
#[async_trait::async_trait]
impl MockDatanodeHandler for DatanodeWatcher {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning Ok(0) for request: {request:?}, peer: {peer:?}");
self.0.send((peer.clone(), request)).await.unwrap();
Ok(HandleResponse::new(0))
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[derive(Clone)]
pub struct RetryErrorDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for RetryErrorDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning retry later for request: {request:?}, peer: {peer:?}");
Err(Error::RetryLater {
source: BoxedError::new(
error::UnexpectedSnafu {
err_msg: "retry later",
}
.build(),
),
})
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[derive(Clone)]
pub struct UnexpectedErrorDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for UnexpectedErrorDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning mock error for request: {request:?}, peer: {peer:?}");
error::UnexpectedSnafu {
err_msg: "mock error",
}
.fail()
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[derive(Clone)]
pub struct RequestOutdatedErrorDatanodeHandler;
#[derive(Debug, Snafu)]
#[snafu(display("A mock RequestOutdated error"))]
struct MockRequestOutdatedError;
impl StackError for MockRequestOutdatedError {
fn debug_fmt(&self, _: usize, _: &mut Vec<String>) {}
fn next(&self) -> Option<&dyn StackError> {
None
}
}
impl ErrorExt for MockRequestOutdatedError {
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn status_code(&self) -> StatusCode {
StatusCode::RequestOutdated
}
}
#[async_trait::async_trait]
impl MockDatanodeHandler for RequestOutdatedErrorDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning mock error for request: {request:?}, peer: {peer:?}");
Err(BoxedError::new(MockRequestOutdatedError)).context(error::ExternalSnafu)
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[derive(Clone)]
pub struct NaiveDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for NaiveDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning Ok(0) for request: {request:?}, peer: {peer:?}");
Ok(HandleResponse::new(0))
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
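
These handlers plug into `MockDatanodeManager`; the `DatanodeWatcher` variant forwards every `(Peer, RegionRequest)` it receives over an mpsc channel so a test can assert which request reached which peer. A toy, self-contained version of that watcher pattern (assuming a tokio runtime with the `macros` and `rt` features; the request type is a stand-in):

use tokio::sync::mpsc;

// Stand-in for (Peer, RegionRequest).
struct ToyRequest(u64);

// A watcher handler in miniature: acknowledge every request and
// forward it to the test through the channel.
#[derive(Clone)]
struct ToyWatcher(mpsc::Sender<ToyRequest>);

impl ToyWatcher {
    async fn handle(&self, req: ToyRequest) -> u64 {
        self.0.send(req).await.unwrap();
        0 // mimics HandleResponse::new(0)
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(8);
    let watcher = ToyWatcher(tx);
    watcher.handle(ToyRequest(1)).await;
    watcher.handle(ToyRequest(2)).await;
    // The test side drains the channel and asserts on what was sent.
    assert_eq!(rx.try_recv().unwrap().0, 1);
    assert_eq!(rx.try_recv().unwrap().0, 2);
}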

View File

@@ -13,6 +13,8 @@
// limitations under the License.
mod alter_logical_tables;
mod alter_table;
mod create_logical_tables;
mod create_table;
mod drop_database;
mod drop_table;

View File

@@ -23,8 +23,8 @@ use common_procedure_test::MockContextProvider;
use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
use crate::ddl::test_util::alter_table::TestAlterTableExprBuilder;
use crate::ddl::test_util::columns::TestColumnDefBuilder;
use crate::ddl::test_util::datanode_handler::NaiveDatanodeHandler;
use crate::ddl::test_util::{create_logical_table, create_physical_table};
use crate::ddl::tests::create_logical_tables::NaiveDatanodeHandler;
use crate::error::Error::{AlterLogicalTablesInvalidArguments, TableNotFound};
use crate::key::table_name::TableNameKey;
use crate::rpc::ddl::AlterTableTask;

View File

@@ -0,0 +1,345 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::assert_matches::assert_matches;
use std::collections::HashMap;
use std::sync::Arc;
use api::v1::alter_expr::Kind;
use api::v1::region::{region_request, RegionRequest};
use api::v1::{
AddColumn, AddColumns, AlterExpr, ColumnDataType, ColumnDef as PbColumnDef, DropColumn,
DropColumns, SemanticType,
};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use store_api::storage::RegionId;
use tokio::sync::mpsc::{self};
use crate::ddl::alter_table::AlterTableProcedure;
use crate::ddl::test_util::alter_table::TestAlterTableExprBuilder;
use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::datanode_handler::{
DatanodeWatcher, RequestOutdatedErrorDatanodeHandler,
};
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::peer::Peer;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{Region, RegionRoute};
use crate::test_util::{new_ddl_context, MockDatanodeManager};
fn test_rename_alter_table_task(table_name: &str, new_table_name: &str) -> AlterTableTask {
let builder = TestAlterTableExprBuilder::default()
.table_name(table_name)
.new_table_name(new_table_name)
.build()
.unwrap();
AlterTableTask {
alter_table: builder.into(),
}
}
#[tokio::test]
async fn test_on_prepare_table_exists_err() {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let task = test_create_table_task("foo", 1024);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![]),
HashMap::new(),
)
.await
.unwrap();
let task = test_rename_alter_table_task("non-exists", "foo");
let mut procedure = AlterTableProcedure::new(cluster_id, 1024, task, ddl_context).unwrap();
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err.status_code(), StatusCode::TableAlreadyExists);
}
#[tokio::test]
async fn test_on_prepare_table_not_exists_err() {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let task = test_rename_alter_table_task("non-exists", "foo");
let mut procedure = AlterTableProcedure::new(cluster_id, 1024, task, ddl_context).unwrap();
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err.status_code(), StatusCode::TableNotFound);
}
#[tokio::test]
async fn test_on_submit_alter_request() {
let (tx, mut rx) = mpsc::channel(8);
let datanode_handler = DatanodeWatcher(tx);
let datanode_manager = Arc::new(MockDatanodeManager::new(datanode_handler));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_id = 1024;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(2)),
follower_peers: vec![Peer::empty(4)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 3)),
leader_peer: Some(Peer::empty(3)),
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
]),
HashMap::new(),
)
.await
.unwrap();
let alter_table_task = AlterTableTask {
alter_table: AlterExpr {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
schema_name: DEFAULT_SCHEMA_NAME.to_string(),
table_name: table_name.to_string(),
kind: Some(Kind::DropColumns(DropColumns {
drop_columns: vec![DropColumn {
name: "my_field_column".to_string(),
}],
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, alter_table_task, ddl_context).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
let check = |peer: Peer,
request: RegionRequest,
expected_peer_id: u64,
expected_region_id: RegionId| {
assert_eq!(peer.id, expected_peer_id);
let Some(region_request::Body::Alter(req)) = request.body else {
unreachable!();
};
assert_eq!(req.region_id, expected_region_id);
};
let mut results = Vec::new();
for _ in 0..3 {
let result = rx.try_recv().unwrap();
results.push(result);
}
results.sort_unstable_by(|(a, _), (b, _)| a.id.cmp(&b.id));
let (peer, request) = results.remove(0);
check(peer, request, 1, RegionId::new(table_id, 1));
let (peer, request) = results.remove(0);
check(peer, request, 2, RegionId::new(table_id, 2));
let (peer, request) = results.remove(0);
check(peer, request, 3, RegionId::new(table_id, 3));
}
#[tokio::test]
async fn test_on_submit_alter_request_with_outdated_request() {
let datanode_manager = Arc::new(MockDatanodeManager::new(
RequestOutdatedErrorDatanodeHandler,
));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_id = 1024;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(2)),
follower_peers: vec![Peer::empty(4)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 3)),
leader_peer: Some(Peer::empty(3)),
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
]),
HashMap::new(),
)
.await
.unwrap();
let alter_table_task = AlterTableTask {
alter_table: AlterExpr {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
schema_name: DEFAULT_SCHEMA_NAME.to_string(),
table_name: table_name.to_string(),
kind: Some(Kind::DropColumns(DropColumns {
drop_columns: vec![DropColumn {
name: "my_field_column".to_string(),
}],
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, alter_table_task, ddl_context).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
}
#[tokio::test]
async fn test_on_update_metadata_rename() {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_name = "foo";
let new_table_name = "bar";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![]),
HashMap::new(),
)
.await
.unwrap();
let task = test_rename_alter_table_task(table_name, new_table_name);
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context.clone()).unwrap();
procedure.on_prepare().await.unwrap();
procedure.on_update_metadata().await.unwrap();
let old_table_name_exists = ddl_context
.table_metadata_manager
.table_name_manager()
.exists(TableNameKey::new(
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
table_name,
))
.await
.unwrap();
assert!(!old_table_name_exists);
let value = ddl_context
.table_metadata_manager
.table_name_manager()
.get(TableNameKey::new(
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
new_table_name,
))
.await
.unwrap()
.unwrap();
assert_eq!(value.table_id(), table_id);
}
#[tokio::test]
async fn test_on_update_metadata_add_columns() {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![]),
HashMap::new(),
)
.await
.unwrap();
let task = AlterTableTask {
alter_table: AlterExpr {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
schema_name: DEFAULT_SCHEMA_NAME.to_string(),
table_name: table_name.to_string(),
kind: Some(Kind::AddColumns(AddColumns {
add_columns: vec![AddColumn {
column_def: Some(PbColumnDef {
name: "my_tag3".to_string(),
data_type: ColumnDataType::String as i32,
semantic_type: SemanticType::Tag as i32,
is_nullable: true,
..Default::default()
}),
location: None,
}],
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context.clone()).unwrap();
procedure.on_prepare().await.unwrap();
procedure.on_update_metadata().await.unwrap();
let table_info = ddl_context
.table_metadata_manager
.table_info_manager()
.get(table_id)
.await
.unwrap()
.unwrap()
.into_inner()
.table_info;
assert_eq!(
table_info.meta.schema.column_schemas.len() as u32,
table_info.meta.next_column_id
);
}

View File

@@ -15,25 +15,21 @@
use std::assert_matches::assert_matches;
use std::sync::Arc;
use api::v1::region::{QueryRequest, RegionRequest};
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId, Status};
use common_procedure_test::MockContextProvider;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::debug;
use store_api::storage::RegionId;
use crate::datanode_manager::HandleResponse;
use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
use crate::ddl::test_util::datanode_handler::NaiveDatanodeHandler;
use crate::ddl::test_util::{
create_physical_table_metadata, test_create_logical_table_task, test_create_physical_table_task,
};
use crate::ddl::{TableMetadata, TableMetadataAllocatorContext};
use crate::error::{Error, Result};
use crate::error::Error;
use crate::key::table_route::TableRouteValue;
use crate::peer::Peer;
use crate::test_util::{new_ddl_context, MockDatanodeHandler, MockDatanodeManager};
use crate::test_util::{new_ddl_context, MockDatanodeManager};
#[tokio::test]
async fn test_on_prepare_physical_table_not_found() {
@@ -229,25 +225,6 @@ async fn test_on_prepare_part_logical_tables_exist() {
assert_matches!(status, Status::Executing { persist: true });
}
#[derive(Clone)]
pub struct NaiveDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for NaiveDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning Ok(0) for request: {request:?}, peer: {peer:?}");
Ok(HandleResponse::new(0))
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[tokio::test]
async fn test_on_create_metadata() {
let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));

View File

@@ -17,42 +17,24 @@ use std::collections::HashMap;
use std::sync::Arc;
use api::v1::meta::Partition;
use api::v1::region::{QueryRequest, RegionRequest};
use api::v1::{ColumnDataType, SemanticType};
use common_error::ext::{BoxedError, ErrorExt};
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId, Status};
use common_procedure_test::MockContextProvider;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::debug;
use crate::datanode_manager::HandleResponse;
use crate::ddl::create_table::CreateTableProcedure;
use crate::ddl::test_util::columns::TestColumnDefBuilder;
use crate::ddl::test_util::create_table::{
build_raw_table_info_from_expr, TestCreateTableExprBuilder,
};
use crate::error;
use crate::error::{Error, Result};
use crate::ddl::test_util::datanode_handler::{
NaiveDatanodeHandler, RetryErrorDatanodeHandler, UnexpectedErrorDatanodeHandler,
};
use crate::error::Error;
use crate::key::table_route::TableRouteValue;
use crate::peer::Peer;
use crate::rpc::ddl::CreateTableTask;
use crate::test_util::{new_ddl_context, MockDatanodeHandler, MockDatanodeManager};
#[async_trait::async_trait]
impl MockDatanodeHandler for () {
async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> Result<HandleResponse> {
unreachable!()
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
use crate::test_util::{new_ddl_context, MockDatanodeManager};
fn test_create_table_task(name: &str) -> CreateTableTask {
let create_table = TestCreateTableExprBuilder::default()
@@ -174,32 +156,6 @@ async fn test_on_prepare_with_no_partition_err() {
.contains("The number of partitions must be greater than 0"),);
}
#[derive(Clone)]
pub struct RetryErrorDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for RetryErrorDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning retry later for request: {request:?}, peer: {peer:?}");
Err(Error::RetryLater {
source: BoxedError::new(
error::UnexpectedSnafu {
err_msg: "retry later",
}
.build(),
),
})
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[tokio::test]
async fn test_on_datanode_create_regions_should_retry() {
common_telemetry::init_default_ut_logging();
@@ -218,28 +174,6 @@ async fn test_on_datanode_create_regions_should_retry() {
assert!(error.is_retry_later());
}
#[derive(Clone)]
pub struct UnexpectedErrorDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for UnexpectedErrorDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning mock error for request: {request:?}, peer: {peer:?}");
error::UnexpectedSnafu {
err_msg: "mock error",
}
.fail()
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[tokio::test]
async fn test_on_datanode_create_regions_should_not_retry() {
common_telemetry::init_default_ut_logging();
@@ -258,25 +192,6 @@ async fn test_on_datanode_create_regions_should_not_retry() {
assert!(!error.is_retry_later());
}
#[derive(Clone)]
pub struct NaiveDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for NaiveDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning Ok(0) for request: {request:?}, peer: {peer:?}");
Ok(HandleResponse::new(0))
}
async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
#[tokio::test]
async fn test_on_create_metadata_error() {
common_telemetry::init_default_ut_logging();

View File

@@ -20,8 +20,8 @@ use common_procedure_test::MockContextProvider;
use futures::TryStreamExt;
use crate::ddl::drop_database::DropDatabaseProcedure;
use crate::ddl::test_util::datanode_handler::{NaiveDatanodeHandler, RetryErrorDatanodeHandler};
use crate::ddl::test_util::{create_logical_table, create_physical_table};
use crate::ddl::tests::create_table::{NaiveDatanodeHandler, RetryErrorDatanodeHandler};
use crate::key::schema_name::SchemaNameKey;
use crate::test_util::{new_ddl_context, MockDatanodeManager};

View File

@@ -0,0 +1,291 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use api::v1::region::{region_request, RegionRequest};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId};
use common_procedure_test::MockContextProvider;
use store_api::storage::RegionId;
use tokio::sync::mpsc;
use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
use crate::ddl::drop_table::DropTableProcedure;
use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::datanode_handler::{DatanodeWatcher, NaiveDatanodeHandler};
use crate::ddl::test_util::{
create_physical_table_metadata, test_create_logical_table_task, test_create_physical_table_task,
};
use crate::ddl::{TableMetadata, TableMetadataAllocatorContext};
use crate::key::table_route::TableRouteValue;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::peer::Peer;
use crate::rpc::ddl::DropTableTask;
use crate::rpc::router::{Region, RegionRoute};
use crate::test_util::{new_ddl_context, new_ddl_context_with_kv_backend, MockDatanodeManager};
#[tokio::test]
async fn test_on_prepare_table_not_exists_err() {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![]),
HashMap::new(),
)
.await
.unwrap();
let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "bar".to_string(),
table_id,
drop_if_exists: false,
};
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_eq!(err.status_code(), StatusCode::TableNotFound);
}
#[tokio::test]
async fn test_on_prepare_table() {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![]),
HashMap::new(),
)
.await
.unwrap();
let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "bar".to_string(),
table_id,
drop_if_exists: true,
};
// Drop if exists
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: table_name.to_string(),
table_id,
drop_if_exists: false,
};
// Drop table
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
procedure.on_prepare().await.unwrap();
}
#[tokio::test]
async fn test_on_datanode_drop_regions() {
let (tx, mut rx) = mpsc::channel(8);
let datanode_handler = DatanodeWatcher(tx);
let datanode_manager = Arc::new(MockDatanodeManager::new(datanode_handler));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_id = 1024;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(2)),
follower_peers: vec![Peer::empty(4)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 3)),
leader_peer: Some(Peer::empty(3)),
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
]),
HashMap::new(),
)
.await
.unwrap();
let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: table_name.to_string(),
table_id,
drop_if_exists: false,
};
// Drop table
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
procedure.on_prepare().await.unwrap();
procedure.on_datanode_drop_regions().await.unwrap();
let check = |peer: Peer,
request: RegionRequest,
expected_peer_id: u64,
expected_region_id: RegionId| {
assert_eq!(peer.id, expected_peer_id);
let Some(region_request::Body::Drop(req)) = request.body else {
unreachable!();
};
assert_eq!(req.region_id, expected_region_id);
};
let mut results = Vec::new();
for _ in 0..3 {
let result = rx.try_recv().unwrap();
results.push(result);
}
results.sort_unstable_by(|(a, _), (b, _)| a.id.cmp(&b.id));
let (peer, request) = results.remove(0);
check(peer, request, 1, RegionId::new(table_id, 1));
let (peer, request) = results.remove(0);
check(peer, request, 2, RegionId::new(table_id, 2));
let (peer, request) = results.remove(0);
check(peer, request, 3, RegionId::new(table_id, 3));
}
#[tokio::test]
async fn test_on_rollback() {
let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(datanode_manager, kv_backend.clone());
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
table_id,
table_route,
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
create_physical_table_metadata(
&ddl_context,
create_physical_table_task.table_info.clone(),
TableRouteValue::Physical(table_route),
)
.await;
// The create logical table procedure.
let physical_table_id = table_id;
// Creates the logical table metadata.
let task = test_create_logical_table_task("foo");
let mut procedure = CreateLogicalTablesProcedure::new(
cluster_id,
vec![task],
physical_table_id,
ddl_context.clone(),
);
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
provider: Arc::new(MockContextProvider::default()),
};
procedure.execute(&ctx).await.unwrap();
// Triggers procedure to create table metadata
let status = procedure.execute(&ctx).await.unwrap();
let table_ids = status.downcast_output_ref::<Vec<u32>>().unwrap();
assert_eq!(*table_ids, vec![1025]);
let expected_kvs = kv_backend.dump();
// Drops the physical table
{
let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "phy_table".to_string(),
table_id: physical_table_id,
drop_if_exists: false,
};
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
procedure.on_delete_metadata().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
provider: Arc::new(MockContextProvider::default()),
};
procedure.rollback(&ctx).await.unwrap();
// Rollback again
procedure.rollback(&ctx).await.unwrap();
let kvs = kv_backend.dump();
assert_eq!(kvs, expected_kvs);
}
// Drops the logical table
let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "foo".to_string(),
table_id: table_ids[0],
drop_if_exists: false,
};
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
procedure.on_delete_metadata().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
provider: Arc::new(MockContextProvider::default()),
};
procedure.rollback(&ctx).await.unwrap();
// Rollback again
procedure.rollback(&ctx).await.unwrap();
let kvs = kv_backend.dump();
assert_eq!(kvs, expected_kvs);
}

View File

@@ -42,7 +42,6 @@ use crate::error::{
};
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use crate::region_keeper::MemoryRegionKeeperRef;
use crate::rpc::ddl::DdlTask::{
@@ -206,13 +205,12 @@ impl DdlManager {
pub async fn submit_alter_table_task(
&self,
cluster_id: ClusterId,
table_id: TableId,
alter_table_task: AlterTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure =
AlterTableProcedure::new(cluster_id, alter_table_task, table_info_value, context)?;
let procedure = AlterTableProcedure::new(cluster_id, table_id, alter_table_task, context)?;
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -285,18 +283,10 @@ impl DdlManager {
&self,
cluster_id: ClusterId,
drop_table_task: DropTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
table_route_value: DeserializedValueWithBytes<TableRouteValue>,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = DropTableProcedure::new(
cluster_id,
drop_table_task,
table_route_value,
table_info_value,
context,
);
let procedure = DropTableProcedure::new(cluster_id, drop_table_task, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -442,12 +432,12 @@ async fn handle_alter_table_task(
})?
.table_id();
let (table_info_value, table_route_value) = ddl_manager
let table_route_value = ddl_manager
.table_metadata_manager()
.get_full_table_info(table_id)
.await?;
let table_route_value = table_route_value
.table_route_manager()
.table_route_storage()
.get_raw(table_id)
.await?
.context(TableRouteNotFoundSnafu { table_id })?
.into_inner();
@@ -458,12 +448,8 @@ async fn handle_alter_table_task(
}
);
let table_info_value = table_info_value.with_context(|| TableInfoNotFoundSnafu {
table: table_ref.to_string(),
})?;
let (id, _) = ddl_manager
.submit_alter_table_task(cluster_id, alter_table_task, table_info_value)
.submit_alter_table_task(cluster_id, table_id, alter_table_task)
.await?;
info!("Table: {table_id} is altered via procedure_id {id:?}");
@@ -480,32 +466,8 @@ async fn handle_drop_table_task(
drop_table_task: DropTableTask,
) -> Result<SubmitDdlTaskResponse> {
let table_id = drop_table_task.table_id;
let table_metadata_manager = &ddl_manager.table_metadata_manager();
let table_ref = drop_table_task.table_ref();
let table_info_value = table_metadata_manager
.table_info_manager()
.get(table_id)
.await?;
let (_, table_route_value) = table_metadata_manager
.table_route_manager()
.get_physical_table_route(table_id)
.await?;
let table_info_value = table_info_value.with_context(|| TableInfoNotFoundSnafu {
table: table_ref.to_string(),
})?;
let table_route_value =
DeserializedValueWithBytes::from_inner(TableRouteValue::Physical(table_route_value));
let (id, _) = ddl_manager
.submit_drop_table_task(
cluster_id,
drop_table_task,
table_info_value,
table_route_value,
)
.submit_drop_table_task(cluster_id, drop_table_task)
.await?;
info!("Table: {table_id} is dropped via procedure_id {id:?}");

View File

@@ -421,6 +421,9 @@ pub enum Error {
#[snafu(display("Invalid role: {}", role))]
InvalidRole { role: i32, location: Location },
#[snafu(display("Atomic key changed: {err_msg}"))]
CasKeyChanged { err_msg: String, location: Location },
#[snafu(display("Failed to parse {} from utf8", name))]
FromUtf8 {
name: String,
@@ -440,7 +443,8 @@ impl ErrorExt for Error {
| EtcdTxnOpResponse { .. }
| EtcdFailed { .. }
| EtcdTxnFailed { .. }
| ConnectEtcd { .. } => StatusCode::Internal,
| ConnectEtcd { .. }
| CasKeyChanged { .. } => StatusCode::Internal,
SerdeJson { .. }
| ParseOption { .. }

View File

@@ -56,9 +56,12 @@ pub mod table_region;
pub mod table_route;
#[cfg(any(test, feature = "testing"))]
pub mod test_utils;
// TODO(weny): remove it.
#[allow(dead_code)]
mod tombstone;
mod txn_helper;
use std::collections::{BTreeMap, HashMap};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Debug;
use std::ops::Deref;
use std::sync::Arc;
@@ -83,9 +86,13 @@ use self::catalog_name::{CatalogManager, CatalogNameKey, CatalogNameValue};
use self::datanode_table::RegionInfo;
use self::schema_name::{SchemaManager, SchemaNameKey, SchemaNameValue};
use self::table_route::{TableRouteManager, TableRouteValue};
use self::tombstone::TombstoneManager;
use crate::ddl::utils::region_storage_path;
use crate::error::{self, Result, SerdeJsonSnafu};
use crate::kv_backend::txn::{Txn, TxnOpResponse};
use crate::error::{self, Result, SerdeJsonSnafu, UnexpectedSnafu};
use crate::key::table_route::TableRouteKey;
use crate::key::tombstone::Key;
use crate::key::txn_helper::TxnOpGetResponseSet;
use crate::kv_backend::txn::{Txn, TxnOp, TxnOpResponse};
use crate::kv_backend::KvBackendRef;
use crate::rpc::router::{region_distribution, RegionRoute, RegionStatus};
use crate::table_name::TableName;
@@ -97,7 +104,6 @@ pub const MAINTENANCE_KEY: &str = "maintenance";
const DATANODE_TABLE_KEY_PREFIX: &str = "__dn_table";
const TABLE_REGION_KEY_PREFIX: &str = "__table_region";
pub const REMOVED_PREFIX: &str = "__removed";
pub const TABLE_INFO_KEY_PREFIX: &str = "__table_info";
pub const TABLE_NAME_KEY_PREFIX: &str = "__table_name";
pub const CATALOG_NAME_KEY_PREFIX: &str = "__catalog_name";
@@ -145,6 +151,33 @@ pub trait TableMetaKey {
fn as_raw_key(&self) -> Vec<u8>;
}
pub(crate) trait TableMetaKeyGetTxnOp {
fn build_get_op(
&self,
) -> (
TxnOp,
impl for<'a> FnMut(&'a mut TxnOpGetResponseSet) -> Option<Vec<u8>>,
);
}
impl TableMetaKey for String {
fn as_raw_key(&self) -> Vec<u8> {
self.as_bytes().to_vec()
}
}
impl TableMetaKeyGetTxnOp for String {
fn build_get_op(
&self,
) -> (
TxnOp,
impl for<'a> FnMut(&'a mut TxnOpGetResponseSet) -> Option<Vec<u8>>,
) {
let key = self.as_raw_key();
(TxnOp::Get(key.clone()), TxnOpGetResponseSet::filter(key))
}
}
pub trait TableMetaValue {
fn try_from_raw_value(raw_value: &[u8]) -> Result<Self>
where
@@ -162,6 +195,7 @@ pub struct TableMetadataManager {
catalog_manager: CatalogManager,
schema_manager: SchemaManager,
table_route_manager: TableRouteManager,
tombstone_manager: TombstoneManager,
kv_backend: KvBackendRef,
}
@@ -283,7 +317,7 @@ impl<T: Serialize + DeserializeOwned + TableMetaValue> DeserializedValueWithByte
self.bytes.to_vec()
}
/// Note: used for test purposes.
#[cfg(any(test, feature = "testing"))]
pub fn from_inner(inner: T) -> Self {
let bytes = serde_json::to_vec(&inner).unwrap();
@@ -303,6 +337,7 @@ impl TableMetadataManager {
catalog_manager: CatalogManager::new(kv_backend.clone()),
schema_manager: SchemaManager::new(kv_backend.clone()),
table_route_manager: TableRouteManager::new(kv_backend.clone()),
tombstone_manager: TombstoneManager::new(kv_backend.clone()),
kv_backend,
}
}
@@ -363,19 +398,16 @@ impl TableMetadataManager {
Option<DeserializedValueWithBytes<TableInfoValue>>,
Option<DeserializedValueWithBytes<TableRouteValue>>,
)> {
let (get_table_route_txn, table_route_decoder) = self
.table_route_manager
.table_route_storage()
.build_get_txn(table_id);
let (get_table_info_txn, table_info_decoder) =
self.table_info_manager.build_get_txn(table_id);
let txn = Txn::merge_all(vec![get_table_route_txn, get_table_info_txn]);
let res = self.kv_backend.txn(txn).await?;
let table_info_value = table_info_decoder(&res.responses)?;
let table_route_value = table_route_decoder(&res.responses)?;
let table_info_key = TableInfoKey::new(table_id);
let table_route_key = TableRouteKey::new(table_id);
let (table_info_txn, table_info_filter) = table_info_key.build_get_op();
let (table_route_txn, table_route_filter) = table_route_key.build_get_op();
let txn = Txn::new().and_then(vec![table_info_txn, table_route_txn]);
let mut res = self.kv_backend.txn(txn).await?;
let mut set = TxnOpGetResponseSet::from(&mut res.responses);
let table_info_value = TxnOpGetResponseSet::decode_with(table_info_filter)(&mut set)?;
let table_route_value = TxnOpGetResponseSet::decode_with(table_route_filter)(&mut set)?;
Ok((table_info_value, table_route_value))
}
@@ -545,47 +577,106 @@ impl TableMetadataManager {
Ok(())
}
/// Deletes metadata for table.
/// The caller MUST ensure it has the exclusive access to `TableNameKey`.
pub async fn delete_table_metadata(
fn table_metadata_keys(
&self,
table_id: TableId,
table_name: &TableName,
region_routes: &[RegionRoute],
) -> Result<()> {
// Deletes table name.
table_route_value: &TableRouteValue,
) -> Result<Vec<Key>> {
// Builds keys
let datanode_ids = if table_route_value.is_physical() {
region_distribution(table_route_value.region_routes()?)
.into_keys()
.collect()
} else {
vec![]
};
let mut keys = Vec::with_capacity(3 + datanode_ids.len());
let table_name = TableNameKey::new(
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
);
let table_info_key = TableInfoKey::new(table_id);
let table_route_key = TableRouteKey::new(table_id);
let datanode_table_keys = datanode_ids
.into_iter()
.map(|datanode_id| DatanodeTableKey::new(datanode_id, table_id))
.collect::<HashSet<_>>();
let delete_table_name_txn = self.table_name_manager().build_delete_txn(&table_name)?;
keys.push(Key::compare_and_swap(table_name.as_raw_key()));
keys.push(Key::new(table_info_key.as_raw_key()));
keys.push(Key::new(table_route_key.as_raw_key()));
for key in &datanode_table_keys {
keys.push(Key::new(key.as_raw_key()));
}
Ok(keys)
}
// Deletes table info.
let delete_table_info_txn = self.table_info_manager().build_delete_txn(table_id)?;
/// Deletes metadata for table **logically**.
/// The caller MUST ensure it has the exclusive access to `TableNameKey`.
pub async fn delete_table_metadata(
&self,
table_id: TableId,
table_name: &TableName,
table_route_value: &TableRouteValue,
) -> Result<()> {
let keys = self.table_metadata_keys(table_id, table_name, table_route_value)?;
self.tombstone_manager.create(keys).await?;
Ok(())
}
// Deletes datanode table key value pairs.
let distribution = region_distribution(region_routes);
let delete_datanode_txn = self
.datanode_table_manager()
.build_delete_txn(table_id, distribution)?;
/// Deletes metadata tombstone for table **permanently**.
/// The caller MUST ensure it has the exclusive access to `TableNameKey`.
pub async fn delete_table_metadata_tombstone(
&self,
table_id: TableId,
table_name: &TableName,
table_route_value: &TableRouteValue,
) -> Result<()> {
let keys = self
.table_metadata_keys(table_id, table_name, table_route_value)?
.into_iter()
.map(|key| key.into_bytes())
.collect::<Vec<_>>();
self.tombstone_manager.delete(keys).await
}
// Deletes table route.
let delete_table_route_txn = self
.table_route_manager()
.table_route_storage()
.build_delete_txn(table_id)?;
/// Restores metadata for table.
/// The caller MUST ensure it has the exclusive access to `TableNameKey`.
pub async fn restore_table_metadata(
&self,
table_id: TableId,
table_name: &TableName,
table_route_value: &TableRouteValue,
) -> Result<()> {
let keys = self.table_metadata_keys(table_id, table_name, table_route_value)?;
self.tombstone_manager.restore(keys).await?;
Ok(())
}
let txn = Txn::merge_all(vec![
delete_table_name_txn,
delete_table_info_txn,
delete_datanode_txn,
delete_table_route_txn,
]);
/// Deletes metadata for table **permanently**.
/// The caller MUST ensure it has the exclusive access to `TableNameKey`.
pub async fn destroy_table_metadata(
&self,
table_id: TableId,
table_name: &TableName,
table_route_value: &TableRouteValue,
) -> Result<()> {
let operations = self
.table_metadata_keys(table_id, table_name, table_route_value)?
.into_iter()
.map(|key| TxnOp::Delete(key.into_bytes()))
.collect::<Vec<_>>();
// It always succeeds.
let _ = self.kv_backend.txn(txn).await?;
let txn = Txn::new().and_then(operations);
let resp = self.kv_backend.txn(txn).await?;
ensure!(
resp.succeeded,
UnexpectedSnafu {
err_msg: format!("Failed to destroy table metadata: {table_id}")
}
);
Ok(())
}
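Taken together, the four methods above give `DropTableProcedure` a reversible deletion path. A minimal sketch of the intended flow, not taken from the diff itself; `manager`, `table_id`, `table_name`, and `route` stand in for the values a procedure actually holds:

// Logical delete: origin values are moved behind "__tombstone/" keys.
manager.delete_table_metadata(table_id, &table_name, &route).await?;
// Rollback path: move the tombstoned values back to their origin keys.
manager.restore_table_metadata(table_id, &table_name, &route).await?;
// Success path instead: purge the tombstones permanently.
// manager.delete_table_metadata_tombstone(table_id, &table_name, &route).await?;
// Irreversible variant that skips tombstones entirely:
// manager.destroy_table_metadata(table_id, &table_name, &route).await?;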
@@ -595,7 +686,7 @@ impl TableMetadataManager {
/// and the new `TableNameKey` MUST be empty.
pub async fn rename_table(
&self,
current_table_info_value: DeserializedValueWithBytes<TableInfoValue>,
current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
new_table_name: String,
) -> Result<()> {
let current_table_info = &current_table_info_value.table_info;
@@ -629,7 +720,7 @@ impl TableMetadataManager {
// Updates table info.
let (update_table_info_txn, on_update_table_info_failure) = self
.table_info_manager()
.build_update_txn(table_id, &current_table_info_value, &new_table_info_value)?;
.build_update_txn(table_id, current_table_info_value, &new_table_info_value)?;
let txn = Txn::merge_all(vec![update_table_name_txn, update_table_info_txn]);
@@ -653,7 +744,7 @@ impl TableMetadataManager {
/// Updates table info and returns an error if different metadata exists.
pub async fn update_table_info(
&self,
current_table_info_value: DeserializedValueWithBytes<TableInfoValue>,
current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
new_table_info: RawTableInfo,
) -> Result<()> {
let table_id = current_table_info_value.table_info.ident.table_id;
@@ -663,7 +754,7 @@ impl TableMetadataManager {
// Updates table info.
let (update_table_info_txn, on_update_table_info_failure) = self
.table_info_manager()
.build_update_txn(table_id, &current_table_info_value, &new_table_info_value)?;
.build_update_txn(table_id, current_table_info_value, &new_table_info_value)?;
let r = self.kv_backend.txn(update_table_info_txn).await?;
@@ -687,7 +778,7 @@ impl TableMetadataManager {
pub async fn batch_update_table_info_values(
&self,
table_info_value_pairs: Vec<(TableInfoValue, RawTableInfo)>,
table_info_value_pairs: Vec<(DeserializedValueWithBytes<TableInfoValue>, RawTableInfo)>,
) -> Result<()> {
let len = table_info_value_pairs.len();
let mut txns = Vec::with_capacity(len);
@@ -708,7 +799,7 @@ impl TableMetadataManager {
let (update_table_info_txn, on_update_table_info_failure) =
self.table_info_manager().build_update_txn(
table_id,
&DeserializedValueWithBytes::from_inner(table_info_value),
&table_info_value,
&new_table_info_value,
)?;
@@ -873,6 +964,38 @@ macro_rules! impl_table_meta_value {
}
}
macro_rules! impl_table_meta_key_get_txn_op {
($($key: ty), *) => {
$(
impl $crate::key::TableMetaKeyGetTxnOp for $key {
/// Returns a [TxnOp] to retrieve the corresponding value
/// and a filter to retrieve the value from the [TxnOpGetResponseSet]
fn build_get_op(
&self,
) -> (
TxnOp,
impl for<'a> FnMut(
&'a mut TxnOpGetResponseSet,
) -> Option<Vec<u8>>,
) {
let raw_key = self.as_raw_key();
(
TxnOp::Get(raw_key.clone()),
TxnOpGetResponseSet::filter(raw_key),
)
}
}
)*
}
}
impl_table_meta_key_get_txn_op! {
TableNameKey<'_>,
TableInfoKey,
TableRouteKey,
DatanodeTableKey
}
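A sketch of what the macro buys (illustration only; `kv_backend` and `table_id` are placeholders): every listed key type exposes the same `build_get_op`, so heterogeneous metadata keys can be read in one transaction and their values pulled out of a single `TxnOpGetResponseSet`.

let (info_op, mut info_filter) = TableInfoKey::new(table_id).build_get_op();
let (route_op, mut route_filter) = TableRouteKey::new(table_id).build_get_op();
let mut resp = kv_backend
    .txn(Txn::new().and_then(vec![info_op, route_op]))
    .await?;
let mut set = TxnOpGetResponseSet::from(&mut resp.responses);
let info_bytes = info_filter(&mut set);   // Option<Vec<u8>>
let route_bytes = route_filter(&mut set); // Option<Vec<u8>>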
#[macro_export]
macro_rules! impl_optional_meta_value {
($($val_ty: ty), *) => {
@@ -907,6 +1030,7 @@ mod tests {
use std::sync::Arc;
use bytes::Bytes;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_time::util::current_time_millis;
use futures::TryStreamExt;
use store_api::storage::RegionId;
@@ -914,6 +1038,7 @@ mod tests {
use super::datanode_table::DatanodeTableKey;
use super::test_utils;
use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::utils::region_storage_path;
use crate::error::Result;
use crate::key::datanode_table::RegionInfo;
@@ -1155,15 +1280,10 @@ mod tests {
table_info.schema_name,
table_info.name,
);
let table_route_value = &TableRouteValue::physical(region_routes.clone());
// deletes metadata.
table_metadata_manager
.delete_table_metadata(table_id, &table_name, region_routes)
.await
.unwrap();
// if metadata was already deleted, it should be ok.
table_metadata_manager
.delete_table_metadata(table_id, &table_name, region_routes)
.delete_table_metadata(table_id, &table_name, table_route_value)
.await
.unwrap();
@@ -1229,12 +1349,12 @@ mod tests {
DeserializedValueWithBytes::from_inner(TableInfoValue::new(table_info.clone()));
table_metadata_manager
.rename_table(table_info_value.clone(), new_table_name.clone())
.rename_table(&table_info_value, new_table_name.clone())
.await
.unwrap();
// if remote metadata was updated, it should be ok.
table_metadata_manager
.rename_table(table_info_value.clone(), new_table_name.clone())
.rename_table(&table_info_value, new_table_name.clone())
.await
.unwrap();
let mut modified_table_info = table_info.clone();
@@ -1244,7 +1364,7 @@ mod tests {
// if the table_info_value is wrong, it should return an error.
// The ABA problem.
assert!(table_metadata_manager
.rename_table(modified_table_info_value.clone(), new_table_name.clone())
.rename_table(&modified_table_info_value, new_table_name.clone())
.await
.is_err());
@@ -1302,12 +1422,12 @@ mod tests {
DeserializedValueWithBytes::from_inner(TableInfoValue::new(table_info.clone()));
// should be ok.
table_metadata_manager
.update_table_info(current_table_info_value.clone(), new_table_info.clone())
.update_table_info(&current_table_info_value, new_table_info.clone())
.await
.unwrap();
// if table info was updated, it should be ok.
table_metadata_manager
.update_table_info(current_table_info_value.clone(), new_table_info.clone())
.update_table_info(&current_table_info_value, new_table_info.clone())
.await
.unwrap();
@@ -1329,7 +1449,7 @@ mod tests {
// if the current_table_info_value is wrong, it should return an error.
// The ABA problem.
assert!(table_metadata_manager
.update_table_info(wrong_table_info_value, new_table_info)
.update_table_info(&wrong_table_info_value, new_table_info)
.await
.is_err())
}
@@ -1559,4 +1679,118 @@ mod tests {
.await
.is_err());
}
#[tokio::test]
async fn test_destroy_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv.clone());
let table_id = 1025;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
let options = [(0, "test".to_string())].into();
table_metadata_manager
.create_table_metadata(
task.table_info,
TableRouteValue::physical(vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(2)),
follower_peers: vec![Peer::empty(4)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 3)),
leader_peer: Some(Peer::empty(3)),
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
]),
options,
)
.await
.unwrap();
let table_name = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name);
let table_route_value = table_metadata_manager
.table_route_manager
.table_route_storage()
.get_raw(table_id)
.await
.unwrap()
.unwrap();
table_metadata_manager
.destroy_table_metadata(table_id, &table_name, &table_route_value)
.await
.unwrap();
assert!(mem_kv.is_empty());
}
#[tokio::test]
async fn test_restore_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv.clone());
let table_id = 1025;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
let options = [(0, "test".to_string())].into();
table_metadata_manager
.create_table_metadata(
task.table_info,
TableRouteValue::physical(vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(2)),
follower_peers: vec![Peer::empty(4)],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 3)),
leader_peer: Some(Peer::empty(3)),
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
]),
options,
)
.await
.unwrap();
let expected_result = mem_kv.dump();
let table_route_value = table_metadata_manager
.table_route_manager
.table_route_storage()
.get_raw(table_id)
.await
.unwrap()
.unwrap();
let region_routes = table_route_value.region_routes().unwrap();
let table_name = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name);
let table_route_value = TableRouteValue::physical(region_routes.clone());
table_metadata_manager
.delete_table_metadata(table_id, &table_name, &table_route_value)
.await
.unwrap();
table_metadata_manager
.restore_table_metadata(table_id, &table_name, &table_route_value)
.await
.unwrap();
let kvs = mem_kv.dump();
assert_eq!(kvs, expected_result);
}
}

View File

@@ -55,6 +55,7 @@ pub struct RegionInfo {
pub region_wal_options: HashMap<RegionNumber, String>,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct DatanodeTableKey {
pub datanode_id: DatanodeId,
pub table_id: TableId,

View File

@@ -18,10 +18,11 @@ use serde::{Deserialize, Serialize};
use table::metadata::{RawTableInfo, TableId};
use table::table_reference::TableReference;
use super::{txn_helper, DeserializedValueWithBytes, TableMetaValue, TABLE_INFO_KEY_PREFIX};
use crate::error::Result;
use crate::key::TableMetaKey;
use crate::kv_backend::txn::{Txn, TxnOp, TxnOpResponse};
use crate::key::{
txn_helper, DeserializedValueWithBytes, TableMetaKey, TableMetaValue, TABLE_INFO_KEY_PREFIX,
};
use crate::kv_backend::txn::{Txn, TxnOpResponse};
use crate::kv_backend::KvBackendRef;
use crate::rpc::store::BatchGetRequest;
use crate::table_name::TableName;
@@ -101,20 +102,6 @@ impl TableInfoManager {
Self { kv_backend }
}
pub(crate) fn build_get_txn(
&self,
table_id: TableId,
) -> (
Txn,
impl FnOnce(&Vec<TxnOpResponse>) -> Result<Option<DeserializedValueWithBytes<TableInfoValue>>>,
) {
let key = TableInfoKey::new(table_id);
let raw_key = key.as_raw_key();
let txn = Txn::new().and_then(vec![TxnOp::Get(raw_key.clone())]);
(txn, txn_helper::build_txn_response_decoder_fn(raw_key))
}
/// Builds a create table info transaction; it expects that `__table_info/{table_id}` is not occupied.
pub(crate) fn build_create_txn(
&self,
@@ -156,16 +143,6 @@ impl TableInfoManager {
Ok((txn, txn_helper::build_txn_response_decoder_fn(raw_key)))
}
/// Builds a delete table info transaction.
pub(crate) fn build_delete_txn(&self, table_id: TableId) -> Result<Txn> {
let key = TableInfoKey::new(table_id);
let raw_key = key.as_raw_key();
let txn = Txn::new().and_then(vec![TxnOp::Delete(raw_key)]);
Ok(txn)
}
pub async fn get(
&self,
table_id: TableId,
@@ -209,6 +186,38 @@ impl TableInfoManager {
Ok(values)
}
/// Returns batch of `DeserializedValueWithBytes<TableInfoValue>`.
pub async fn batch_get_raw(
&self,
table_ids: &[TableId],
) -> Result<HashMap<TableId, DeserializedValueWithBytes<TableInfoValue>>> {
let lookup_table = table_ids
.iter()
.map(|id| (TableInfoKey::new(*id).as_raw_key(), id))
.collect::<HashMap<_, _>>();
let resp = self
.kv_backend
.batch_get(BatchGetRequest {
keys: lookup_table.keys().cloned().collect::<Vec<_>>(),
})
.await?;
let values = resp
.kvs
.iter()
.map(|kv| {
Ok((
// Safety: must exist.
**lookup_table.get(kv.key()).unwrap(),
DeserializedValueWithBytes::from_inner_slice(&kv.value)?,
))
})
.collect::<Result<HashMap<_, _>>>()?;
Ok(values)
}
}
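A short usage sketch for `batch_get_raw` (illustration; `table_info_manager` is a placeholder): the result is keyed by table id, and each entry keeps both the decoded value and its serialized bytes.

let infos = table_info_manager.batch_get_raw(&[1024, 1025]).await?;
if let Some(raw) = infos.get(&1024) {
    // `raw` gives access to the decoded TableInfoValue while retaining the raw bytes.
    let _name = &raw.table_info.name;
}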
#[cfg(test)]

View File

@@ -194,14 +194,6 @@ impl TableNameManager {
Ok(txn)
}
/// Builds a delete table name transaction. It only executes if the primary key comparison succeeds.
pub(crate) fn build_delete_txn(&self, key: &TableNameKey<'_>) -> Result<Txn> {
let raw_key = key.as_raw_key();
let txn = Txn::new().and_then(vec![TxnOp::Delete(raw_key)]);
Ok(txn)
}
pub async fn get(&self, key: TableNameKey<'_>) -> Result<Option<TableNameValue>> {
let raw_key = key.as_raw_key();
self.kv_backend

View File

@@ -26,7 +26,7 @@ use crate::error::{
UnexpectedLogicalRouteTableSnafu,
};
use crate::key::{RegionDistribution, TableMetaKey, TABLE_ROUTE_PREFIX};
use crate::kv_backend::txn::{Txn, TxnOp, TxnOpResponse};
use crate::kv_backend::txn::{Txn, TxnOpResponse};
use crate::kv_backend::KvBackendRef;
use crate::rpc::router::{region_distribution, RegionRoute};
use crate::rpc::store::BatchGetRequest;
@@ -61,6 +61,27 @@ pub struct LogicalTableRouteValue {
}
impl TableRouteValue {
/// Returns a [TableRouteValue::Physical] if `table_id` equals `physical_table_id`.
/// Otherwise returns a [TableRouteValue::Logical].
pub(crate) fn new(
table_id: TableId,
physical_table_id: TableId,
region_routes: Vec<RegionRoute>,
) -> Self {
if table_id == physical_table_id {
TableRouteValue::physical(region_routes)
} else {
let region_routes = region_routes
.into_iter()
.map(|region| {
debug_assert_eq!(region.region.id.table_id(), physical_table_id);
RegionId::new(table_id, region.region.id.region_number())
})
.collect::<Vec<_>>();
TableRouteValue::logical(physical_table_id, region_routes)
}
}
pub fn physical(region_routes: Vec<RegionRoute>) -> Self {
Self::Physical(PhysicalTableRouteValue::new(region_routes))
}
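Illustration only (the ids are made up, `region_routes` is a placeholder for physical table 1025's routes): `TableRouteValue::new` above remaps every physical region id onto the logical table's id, so region 1025.1 becomes 1026.1 in the logical route.

// Physical: table_id == physical_table_id, region routes are stored as-is.
let physical = TableRouteValue::new(1025, 1025, region_routes.clone());
assert!(physical.is_physical());
// Logical: only the remapped region ids and the physical table id are kept.
let logical = TableRouteValue::new(1026, 1025, region_routes);
assert!(!logical.is_physical());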
@@ -425,21 +446,6 @@ impl TableRouteStorage {
Self { kv_backend }
}
/// Builds a get table route transaction(readonly).
pub(crate) fn build_get_txn(
&self,
table_id: TableId,
) -> (
Txn,
impl FnOnce(&Vec<TxnOpResponse>) -> Result<Option<DeserializedValueWithBytes<TableRouteValue>>>,
) {
let key = TableRouteKey::new(table_id);
let raw_key = key.as_raw_key();
let txn = Txn::new().and_then(vec![TxnOp::Get(raw_key.clone())]);
(txn, txn_helper::build_txn_response_decoder_fn(raw_key))
}
/// Builds a create table route transaction;
/// it expects that `__table_route/{table_id}` is not occupied.
pub fn build_create_txn(
@@ -483,17 +489,6 @@ impl TableRouteStorage {
Ok((txn, txn_helper::build_txn_response_decoder_fn(raw_key)))
}
/// Builds a delete table route transaction;
/// it expects the remote value to equal the `table_route_value`.
pub(crate) fn build_delete_txn(&self, table_id: TableId) -> Result<Txn> {
let key = TableRouteKey::new(table_id);
let raw_key = key.as_raw_key();
let txn = Txn::new().and_then(vec![TxnOp::Delete(raw_key)]);
Ok(txn)
}
/// Returns the [`TableRouteValue`].
pub async fn get(&self, table_id: TableId) -> Result<Option<TableRouteValue>> {
let key = TableRouteKey::new(table_id);
@@ -517,6 +512,37 @@ impl TableRouteStorage {
.transpose()
}
/// Returns the physical `DeserializedValueWithBytes<TableRouteValue>` recursively.
///
/// Returns a [TableRouteNotFound](crate::error::Error::TableRouteNotFound) error if:
/// - the physical table (`logical_or_physical_table_id`) does not exist;
/// - the corresponding physical table of the logical table (`logical_or_physical_table_id`) does not exist.
pub async fn get_raw_physical_table_route(
&self,
logical_or_physical_table_id: TableId,
) -> Result<(TableId, DeserializedValueWithBytes<TableRouteValue>)> {
let table_route =
self.get_raw(logical_or_physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: logical_or_physical_table_id,
})?;
match table_route.get_inner_ref() {
TableRouteValue::Physical(_) => Ok((logical_or_physical_table_id, table_route)),
TableRouteValue::Logical(x) => {
let physical_table_id = x.physical_table_id();
let physical_table_route =
self.get_raw(physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: physical_table_id,
})?;
Ok((physical_table_id, physical_table_route))
}
}
}
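A brief caller-side sketch (placeholders: `table_route_storage`, `any_table_id`): the same call works for both physical and logical table ids and always hands back the physical route.

let (physical_table_id, physical_route) = table_route_storage
    .get_raw_physical_table_route(any_table_id)
    .await?;
// For a physical table, physical_table_id == any_table_id;
// for a logical table, it is the id of the backing physical table.
assert!(physical_route.get_inner_ref().is_physical());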
/// Returns batch of [`TableRouteValue`] that respects the order of `table_ids`.
pub async fn batch_get(&self, table_ids: &[TableId]) -> Result<Vec<Option<TableRouteValue>>> {
let keys = table_ids

View File

@@ -0,0 +1,544 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use snafu::{ensure, OptionExt};
use super::TableMetaKeyGetTxnOp;
use crate::error::{self, Result};
use crate::key::txn_helper::TxnOpGetResponseSet;
use crate::kv_backend::txn::{Compare, CompareOp, Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
/// [TombstoneManager] provides the ability to:
/// - logically delete values
/// - restore the deleted values
pub(crate) struct TombstoneManager {
kv_backend: KvBackendRef,
}
const TOMBSTONE_PREFIX: &str = "__tombstone/";
pub(crate) struct TombstoneKey<T>(T);
fn to_tombstone(key: &[u8]) -> Vec<u8> {
[TOMBSTONE_PREFIX.as_bytes(), key].concat()
}
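Illustration of the key mapping (the concrete key is an example, not from the diff): a tombstoned key is simply the original key placed behind the reserved prefix.

// e.g. "__table_info/1024"  ->  "__tombstone/__table_info/1024"
assert_eq!(
    to_tombstone(b"__table_info/1024"),
    b"__tombstone/__table_info/1024".to_vec()
);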
impl TombstoneKey<&Vec<u8>> {
/// Returns the origin key and tombstone key.
fn to_keys(&self) -> (Vec<u8>, Vec<u8>) {
let key = self.0;
let tombstone_key = to_tombstone(key);
(key.clone(), tombstone_key)
}
/// Returns the origin key and tombstone key.
fn into_keys(self) -> (Vec<u8>, Vec<u8>) {
self.to_keys()
}
/// Returns the tombstone key.
fn to_tombstone_key(&self) -> Vec<u8> {
let key = self.0;
to_tombstone(key)
}
}
impl TableMetaKeyGetTxnOp for TombstoneKey<&Vec<u8>> {
fn build_get_op(
&self,
) -> (
TxnOp,
impl FnMut(&'_ mut TxnOpGetResponseSet) -> Option<Vec<u8>>,
) {
TxnOpGetResponseSet::build_get_op(to_tombstone(self.0))
}
}
/// The key used in the [TombstoneManager].
pub(crate) struct Key {
bytes: Vec<u8>,
// Atomic key:
// The transaction verifies that the value of this key has not changed since it was
// read (compare-and-swap); if it has, the operation fails with `CasKeyChanged`.
atomic: bool,
}
impl Key {
/// Returns a new atomic key.
pub(crate) fn compare_and_swap<T: Into<Vec<u8>>>(key: T) -> Self {
Self {
bytes: key.into(),
atomic: true,
}
}
/// Returns a new normal key.
pub(crate) fn new<T: Into<Vec<u8>>>(key: T) -> Self {
Self {
bytes: key.into(),
atomic: false,
}
}
/// Converts the key into bytes.
pub(crate) fn into_bytes(self) -> Vec<u8> {
self.bytes
}
fn get_inner(&self) -> &Vec<u8> {
&self.bytes
}
fn is_atomic(&self) -> bool {
self.atomic
}
}
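A sketch of how the two constructors are meant to be combined (placeholders: `tombstone_manager`, `table_id`, `table_name_key`), mirroring `table_metadata_keys` in the metadata manager: the name key is atomic so a concurrent create or rename under the same name fails the transaction, while the id-scoped keys are moved without a comparison.

let keys = vec![
    // Atomic: guarded by a compare-and-swap on the current value.
    Key::compare_and_swap(table_name_key.as_raw_key()),
    // Non-atomic: moved to the tombstone without a comparison.
    Key::new(TableInfoKey::new(table_id).as_raw_key()),
    Key::new(TableRouteKey::new(table_id).as_raw_key()),
];
tombstone_manager.create(keys).await?;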
impl TableMetaKeyGetTxnOp for Key {
fn build_get_op(
&self,
) -> (
TxnOp,
impl FnMut(&'_ mut TxnOpGetResponseSet) -> Option<Vec<u8>>,
) {
let key = self.get_inner().clone();
(TxnOp::Get(key.clone()), TxnOpGetResponseSet::filter(key))
}
}
fn format_on_failure_error_message<F: FnMut(&mut TxnOpGetResponseSet) -> Option<Vec<u8>>>(
mut set: TxnOpGetResponseSet,
on_failure_kv_and_filters: Vec<(Vec<u8>, Vec<u8>, F)>,
) -> String {
on_failure_kv_and_filters
.into_iter()
.flat_map(|(key, value, mut filter)| {
let got = filter(&mut set);
let Some(got) = got else {
return Some(format!(
"For key: {} was expected: {}, but value does not exists",
String::from_utf8_lossy(&key),
String::from_utf8_lossy(&value),
));
};
if got != value {
Some(format!(
"For key: {} was expected: {}, but got: {}",
String::from_utf8_lossy(&key),
String::from_utf8_lossy(&value),
String::from_utf8_lossy(&got),
))
} else {
None
}
})
.collect::<Vec<_>>()
.join("; ")
}
fn format_keys(keys: &[Key]) -> String {
keys.iter()
.map(|key| String::from_utf8_lossy(&key.bytes))
.collect::<Vec<_>>()
.join(", ")
}
impl TombstoneManager {
/// Returns [TombstoneManager].
pub fn new(kv_backend: KvBackendRef) -> Self {
Self { kv_backend }
}
/// Creates tombstones for keys.
///
/// Performs the following:
/// - retrieves the values corresponding to `keys`.
/// - deletes the origin keys and stores their values under tombstone keys.
pub(crate) async fn create(&self, keys: Vec<Key>) -> Result<()> {
// Builds transaction to retrieve all values
let (operations, mut filters): (Vec<_>, Vec<_>) =
keys.iter().map(|key| key.build_get_op()).unzip();
let txn = Txn::new().and_then(operations);
let mut resp = self.kv_backend.txn(txn).await?;
ensure!(
resp.succeeded,
error::UnexpectedSnafu {
err_msg: format!(
"Failed to retrieves the metadata, keys: {}",
format_keys(&keys)
),
}
);
let mut set = TxnOpGetResponseSet::from(&mut resp.responses);
// Builds the create tombstone transaction.
let mut tombstone_operations = Vec::with_capacity(keys.len() * 2);
let mut tombstone_comparison = vec![];
let mut on_failure_operations = vec![];
let mut on_failure_kv_and_filters = vec![];
for (idx, key) in keys.iter().enumerate() {
let filter = &mut filters[idx];
let value = filter(&mut set).with_context(|| error::UnexpectedSnafu {
err_msg: format!(
"Missing value, key: {}",
String::from_utf8_lossy(key.get_inner())
),
})?;
let (origin_key, tombstone_key) = TombstoneKey(key.get_inner()).into_keys();
// Compares the atomic key.
if key.is_atomic() {
tombstone_comparison.push(Compare::with_not_exist_value(
tombstone_key.clone(),
CompareOp::Equal,
));
tombstone_comparison.push(Compare::with_value(
origin_key.clone(),
CompareOp::Equal,
value.clone(),
));
let (op, filter) = TxnOpGetResponseSet::build_get_op(origin_key.clone());
on_failure_operations.push(op);
on_failure_kv_and_filters.push((origin_key.clone(), value.clone(), filter));
}
tombstone_operations.push(TxnOp::Delete(origin_key));
tombstone_operations.push(TxnOp::Put(tombstone_key, value));
}
let txn = if !tombstone_comparison.is_empty() {
Txn::new().when(tombstone_comparison)
} else {
Txn::new()
}
.and_then(tombstone_operations);
let txn = if !on_failure_operations.is_empty() {
txn.or_else(on_failure_operations)
} else {
txn
};
let mut resp = self.kv_backend.txn(txn).await?;
// TODO(weny): add tests for atomic key changed.
if !resp.succeeded {
let set = TxnOpGetResponseSet::from(&mut resp.responses);
let err_msg = format_on_failure_error_message(set, on_failure_kv_and_filters);
return error::CasKeyChangedSnafu { err_msg }.fail();
}
Ok(())
}
/// Restores the values of `keys` from their tombstones.
///
/// Performs the following:
/// - retrieves the tombstone values corresponding to `keys`.
/// - deletes the tombstone keys and puts the values back under the origin keys.
pub(crate) async fn restore(&self, keys: Vec<Key>) -> Result<()> {
// Builds transaction to retrieve all tombstone values
let tombstone_keys = keys
.iter()
.map(|key| TombstoneKey(key.get_inner()))
.collect::<Vec<_>>();
let (operations, mut filters): (Vec<_>, Vec<_>) =
tombstone_keys.iter().map(|key| key.build_get_op()).unzip();
let txn = Txn::new().and_then(operations);
let mut resp = self.kv_backend.txn(txn).await?;
ensure!(
resp.succeeded,
error::UnexpectedSnafu {
err_msg: format!(
"Failed to retrieves the metadata, keys: {}",
format_keys(&keys)
),
}
);
let mut set = TxnOpGetResponseSet::from(&mut resp.responses);
// Builds the restore tombstone transaction.
let mut tombstone_operations = Vec::with_capacity(keys.len() * 2);
let mut tombstone_comparison = vec![];
let mut on_failure_operations = vec![];
let mut on_failure_kv_and_filters = vec![];
for (idx, key) in keys.iter().enumerate() {
let filter = &mut filters[idx];
let value = filter(&mut set).with_context(|| error::UnexpectedSnafu {
err_msg: format!(
"Missing value, key: {}",
String::from_utf8_lossy(key.get_inner())
),
})?;
let (origin_key, tombstone_key) = tombstone_keys[idx].to_keys();
// Compares the atomic key.
if key.is_atomic() {
tombstone_comparison.push(Compare::with_not_exist_value(
origin_key.clone(),
CompareOp::Equal,
));
tombstone_comparison.push(Compare::with_value(
tombstone_key.clone(),
CompareOp::Equal,
value.clone(),
));
let (op, filter) = tombstone_keys[idx].build_get_op();
on_failure_operations.push(op);
on_failure_kv_and_filters.push((tombstone_key.clone(), value.clone(), filter));
}
tombstone_operations.push(TxnOp::Delete(tombstone_key));
tombstone_operations.push(TxnOp::Put(origin_key, value));
}
let txn = if !tombstone_comparison.is_empty() {
Txn::new().when(tombstone_comparison)
} else {
Txn::new()
}
.and_then(tombstone_operations);
let txn = if !on_failure_operations.is_empty() {
txn.or_else(on_failure_operations)
} else {
txn
};
let mut resp = self.kv_backend.txn(txn).await?;
// TODO(weny): add tests for atomic key changed.
if !resp.succeeded {
let set = TxnOpGetResponseSet::from(&mut resp.responses);
let err_msg = format_on_failure_error_message(set, on_failure_kv_and_filters);
return error::CasKeyChangedSnafu { err_msg }.fail();
}
Ok(())
}
/// Deletes tombstones for keys.
pub(crate) async fn delete(&self, keys: Vec<Vec<u8>>) -> Result<()> {
let operations = keys
.iter()
.map(|key| TxnOp::Delete(TombstoneKey(key).to_tombstone_key()))
.collect::<Vec<_>>();
let txn = Txn::new().and_then(operations);
// Always succeeds.
let _ = self.kv_backend.txn(txn).await?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use crate::key::tombstone::{Key, TombstoneKey, TombstoneManager};
use crate::kv_backend::memory::MemoryKvBackend;
use crate::kv_backend::KvBackend;
use crate::rpc::store::PutRequest;
#[tokio::test]
async fn test_create_tombstone() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let tombstone_manager = TombstoneManager::new(kv_backend.clone());
kv_backend
.put(PutRequest::new().with_key("bar").with_value("baz"))
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("foo").with_value("hi"))
.await
.unwrap();
tombstone_manager
.create(vec![Key::compare_and_swap("bar"), Key::new("foo")])
.await
.unwrap();
assert!(!kv_backend.exists(b"bar").await.unwrap());
assert!(!kv_backend.exists(b"foo").await.unwrap());
assert_eq!(
kv_backend
.get(&TombstoneKey(&"bar".into()).to_tombstone_key())
.await
.unwrap()
.unwrap()
.value,
b"baz"
);
assert_eq!(
kv_backend
.get(&TombstoneKey(&"foo".into()).to_tombstone_key())
.await
.unwrap()
.unwrap()
.value,
b"hi"
);
assert_eq!(kv_backend.len(), 2);
}
#[tokio::test]
async fn test_create_tombstone_without_atomic_key() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let tombstone_manager = TombstoneManager::new(kv_backend.clone());
kv_backend
.put(PutRequest::new().with_key("bar").with_value("baz"))
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("foo").with_value("hi"))
.await
.unwrap();
tombstone_manager
.create(vec![Key::new("bar"), Key::new("foo")])
.await
.unwrap();
assert!(!kv_backend.exists(b"bar").await.unwrap());
assert!(!kv_backend.exists(b"foo").await.unwrap());
assert_eq!(
kv_backend
.get(&TombstoneKey(&"bar".into()).to_tombstone_key())
.await
.unwrap()
.unwrap()
.value,
b"baz"
);
assert_eq!(
kv_backend
.get(&TombstoneKey(&"foo".into()).to_tombstone_key())
.await
.unwrap()
.unwrap()
.value,
b"hi"
);
assert_eq!(kv_backend.len(), 2);
}
#[tokio::test]
async fn test_create_tombstone_origin_value_not_found_err() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let tombstone_manager = TombstoneManager::new(kv_backend.clone());
kv_backend
.put(PutRequest::new().with_key("bar").with_value("baz"))
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("foo").with_value("hi"))
.await
.unwrap();
let err = tombstone_manager
.create(vec![Key::compare_and_swap("bar"), Key::new("baz")])
.await
.unwrap_err();
assert!(err.to_string().contains("Missing value"));
}
#[tokio::test]
async fn test_restore_tombstone() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let tombstone_manager = TombstoneManager::new(kv_backend.clone());
kv_backend
.put(PutRequest::new().with_key("bar").with_value("baz"))
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("foo").with_value("hi"))
.await
.unwrap();
let expected_kvs = kv_backend.dump();
tombstone_manager
.create(vec![Key::compare_and_swap("bar"), Key::new("foo")])
.await
.unwrap();
tombstone_manager
.restore(vec![Key::compare_and_swap("bar"), Key::new("foo")])
.await
.unwrap();
assert_eq!(expected_kvs, kv_backend.dump());
}
#[tokio::test]
async fn test_restore_tombstone_without_atomic_key() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let tombstone_manager = TombstoneManager::new(kv_backend.clone());
kv_backend
.put(PutRequest::new().with_key("bar").with_value("baz"))
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("foo").with_value("hi"))
.await
.unwrap();
let expected_kvs = kv_backend.dump();
tombstone_manager
.create(vec![Key::compare_and_swap("bar"), Key::new("foo")])
.await
.unwrap();
tombstone_manager
.restore(vec![Key::new("bar"), Key::new("foo")])
.await
.unwrap();
assert_eq!(expected_kvs, kv_backend.dump());
}
#[tokio::test]
async fn test_restore_tombstone_origin_value_not_found_err() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let tombstone_manager = TombstoneManager::new(kv_backend.clone());
kv_backend
.put(PutRequest::new().with_key("bar").with_value("baz"))
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("foo").with_value("hi"))
.await
.unwrap();
tombstone_manager
.create(vec![Key::compare_and_swap("bar"), Key::new("foo")])
.await
.unwrap();
let err = tombstone_manager
.restore(vec![Key::new("bar"), Key::new("baz")])
.await
.unwrap_err();
assert!(err.to_string().contains("Missing value"));
}
#[tokio::test]
async fn test_delete_tombstone() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let tombstone_manager = TombstoneManager::new(kv_backend.clone());
kv_backend
.put(PutRequest::new().with_key("bar").with_value("baz"))
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("foo").with_value("hi"))
.await
.unwrap();
tombstone_manager
.create(vec![Key::compare_and_swap("bar"), Key::new("foo")])
.await
.unwrap();
tombstone_manager
.delete(vec![b"bar".to_vec(), b"foo".to_vec()])
.await
.unwrap();
assert!(kv_backend.is_empty());
}
}

View File

@@ -18,7 +18,69 @@ use serde::Serialize;
use crate::error::Result;
use crate::key::{DeserializedValueWithBytes, TableMetaValue};
use crate::kv_backend::txn::{Compare, CompareOp, Txn, TxnOp, TxnOpResponse};
use crate::rpc::KeyValue;
/// The response set of [TxnOpResponse::ResponseGet]
pub(crate) struct TxnOpGetResponseSet(Vec<KeyValue>);
impl TxnOpGetResponseSet {
/// Returns a [TxnOp] to retrieve the value corresponding to `key` and
/// a filter to consume corresponding [KeyValue] from [TxnOpGetResponseSet].
pub(crate) fn build_get_op<T: Into<Vec<u8>>>(
key: T,
) -> (
TxnOp,
impl FnMut(&'_ mut TxnOpGetResponseSet) -> Option<Vec<u8>>,
) {
let key = key.into();
(TxnOp::Get(key.clone()), TxnOpGetResponseSet::filter(key))
}
/// Returns a filter to consume a [KeyValue] where the key equals `key`.
pub(crate) fn filter(key: Vec<u8>) -> impl FnMut(&mut TxnOpGetResponseSet) -> Option<Vec<u8>> {
move |set| {
let pos = set.0.iter().position(|kv| kv.key == key);
match pos {
Some(pos) => Some(set.0.remove(pos).value),
None => None,
}
}
}
/// Returns a decoder to decode bytes to `DeserializedValueWithBytes<T>`.
pub(crate) fn decode_with<F, T>(
mut f: F,
) -> impl FnMut(&mut TxnOpGetResponseSet) -> Result<Option<DeserializedValueWithBytes<T>>>
where
F: FnMut(&mut TxnOpGetResponseSet) -> Option<Vec<u8>>,
T: Serialize + DeserializeOwned + TableMetaValue,
{
move |set| {
f(set)
.map(|value| DeserializedValueWithBytes::from_inner_slice(&value))
.transpose()
}
}
}
impl From<&mut Vec<TxnOpResponse>> for TxnOpGetResponseSet {
fn from(value: &mut Vec<TxnOpResponse>) -> Self {
let value = value
.extract_if(|resp| matches!(resp, TxnOpResponse::ResponseGet(_)))
.flat_map(|resp| {
// Safety: checked
let TxnOpResponse::ResponseGet(r) = resp else {
unreachable!()
};
r.kvs
})
.collect::<Vec<_>>();
TxnOpGetResponseSet(value)
}
}
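A sketch of the full read path these helpers enable (illustration; `kv_backend` and `table_id` are placeholders, and `TableInfoKey`/`TableInfoValue` come from the key module): build the get operation, run the transaction, wrap the responses, then decode the matching value with its bytes retained.

let (get, filter) =
    TxnOpGetResponseSet::build_get_op(TableInfoKey::new(table_id).as_raw_key());
let mut resp = kv_backend.txn(Txn::new().and_then(vec![get])).await?;
let mut set = TxnOpGetResponseSet::from(&mut resp.responses);
let table_info: Option<DeserializedValueWithBytes<TableInfoValue>> =
    TxnOpGetResponseSet::decode_with(filter)(&mut set)?;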
// TODO(weny): using `TxnOpGetResponseSet`.
pub(crate) fn build_txn_response_decoder_fn<T>(
raw_key: Vec<u8>,
) -> impl FnOnce(&Vec<TxnOpResponse>) -> Result<Option<DeserializedValueWithBytes<T>>>

View File

@@ -626,4 +626,95 @@ mod tests {
assert_eq!(b"test_key".to_vec(), delete.key);
let _ = delete.options.unwrap();
}
use crate::kv_backend::test::{
prepare_kv_with_prefix, test_kv_batch_delete_with_prefix, test_kv_batch_get_with_prefix,
test_kv_compare_and_put_with_prefix, test_kv_delete_range_with_prefix,
test_kv_put_with_prefix, test_kv_range_2_with_prefix, test_kv_range_with_prefix,
unprepare_kv,
};
async fn build_kv_backend() -> Option<EtcdStore> {
let endpoints = std::env::var("GT_ETCD_ENDPOINTS").unwrap_or_default();
if endpoints.is_empty() {
return None;
}
let endpoints = endpoints
.split(',')
.map(|s| s.to_string())
.collect::<Vec<String>>();
let client = Client::connect(endpoints, None)
.await
.expect("malformed endpoints");
Some(EtcdStore {
client,
max_txn_ops: 128,
})
}
#[tokio::test]
async fn test_put() {
if let Some(kv_backend) = build_kv_backend().await {
let prefix = b"put/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_put_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
}
#[tokio::test]
async fn test_range() {
if let Some(kv_backend) = build_kv_backend().await {
let prefix = b"range/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_range_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
}
#[tokio::test]
async fn test_range_2() {
if let Some(kv_backend) = build_kv_backend().await {
test_kv_range_2_with_prefix(kv_backend, b"range2/".to_vec()).await;
}
}
#[tokio::test]
async fn test_batch_get() {
if let Some(kv_backend) = build_kv_backend().await {
let prefix = b"batchGet/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_batch_get_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
}
#[tokio::test(flavor = "multi_thread")]
async fn test_compare_and_put() {
if let Some(kv_backend) = build_kv_backend().await {
let kv_backend = Arc::new(kv_backend);
test_kv_compare_and_put_with_prefix(kv_backend, b"compareAndPut/".to_vec()).await;
}
}
#[tokio::test]
async fn test_delete_range() {
if let Some(kv_backend) = build_kv_backend().await {
let prefix = b"deleteRange/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_delete_range_with_prefix(kv_backend, prefix.to_vec()).await;
}
}
#[tokio::test]
async fn test_batch_delete() {
if let Some(kv_backend) = build_kv_backend().await {
let prefix = b"batchDelete/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_batch_delete_with_prefix(kv_backend, prefix.to_vec()).await;
}
}
}

View File

@@ -70,6 +70,25 @@ impl<T> MemoryKvBackend<T> {
let mut kvs = self.kvs.write().unwrap();
kvs.clear();
}
#[cfg(test)]
/// Returns true if the `kvs` is empty.
pub fn is_empty(&self) -> bool {
self.kvs.read().unwrap().is_empty()
}
#[cfg(test)]
/// Returns the `kvs`.
pub fn dump(&self) -> BTreeMap<Vec<u8>, Vec<u8>> {
let kvs = self.kvs.read().unwrap();
kvs.clone()
}
#[cfg(test)]
/// Returns the length of `kvs`
pub fn len(&self) -> usize {
self.kvs.read().unwrap().len()
}
}
#[async_trait]
@@ -357,14 +376,14 @@ mod tests {
async fn test_put() {
let kv_backend = mock_mem_store_with_data().await;
test_kv_put(kv_backend).await;
test_kv_put(&kv_backend).await;
}
#[tokio::test]
async fn test_range() {
let kv_backend = mock_mem_store_with_data().await;
test_kv_range(kv_backend).await;
test_kv_range(&kv_backend).await;
}
#[tokio::test]
@@ -378,7 +397,7 @@ mod tests {
async fn test_batch_get() {
let kv_backend = mock_mem_store_with_data().await;
test_kv_batch_get(kv_backend).await;
test_kv_batch_get(&kv_backend).await;
}
#[tokio::test(flavor = "multi_thread")]

View File

@@ -21,25 +21,33 @@ use crate::rpc::store::{BatchGetRequest, PutRequest};
use crate::rpc::KeyValue;
use crate::util;
pub fn mock_kvs() -> Vec<KeyValue> {
pub fn mock_kvs(prefix: Vec<u8>) -> Vec<KeyValue> {
vec![
KeyValue {
key: b"key1".to_vec(),
key: [prefix.clone(), b"key1".to_vec()].concat(),
value: b"val1".to_vec(),
},
KeyValue {
key: b"key2".to_vec(),
key: [prefix.clone(), b"key2".to_vec()].concat(),
value: b"val2".to_vec(),
},
KeyValue {
key: b"key3".to_vec(),
key: [prefix.clone(), b"key3".to_vec()].concat(),
value: b"val3".to_vec(),
},
KeyValue {
key: [prefix.clone(), b"key11".to_vec()].concat(),
value: b"val11".to_vec(),
},
]
}
pub async fn prepare_kv(kv_backend: &impl KvBackend) {
let kvs = mock_kvs();
prepare_kv_with_prefix(kv_backend, vec![]).await;
}
pub async fn prepare_kv_with_prefix(kv_backend: &impl KvBackend, prefix: Vec<u8>) {
let kvs = mock_kvs(prefix);
assert!(kv_backend
.batch_put(BatchPutRequest {
kvs,
@@ -47,21 +55,29 @@ pub async fn prepare_kv(kv_backend: &impl KvBackend) {
})
.await
.is_ok());
}
pub async fn unprepare_kv(kv_backend: &impl KvBackend, prefix: &[u8]) {
let range_end = util::get_prefix_end_key(prefix);
assert!(kv_backend
.put(PutRequest {
key: b"key11".to_vec(),
value: b"val11".to_vec(),
.delete_range(DeleteRangeRequest {
key: prefix.to_vec(),
range_end,
..Default::default()
})
.await
.is_ok());
}
pub async fn test_kv_put(kv_backend: impl KvBackend) {
pub async fn test_kv_put(kv_backend: &impl KvBackend) {
test_kv_put_with_prefix(kv_backend, vec![]).await;
}
pub async fn test_kv_put_with_prefix(kv_backend: &impl KvBackend, prefix: Vec<u8>) {
let put_key = [prefix.clone(), b"key11".to_vec()].concat();
let resp = kv_backend
.put(PutRequest {
key: b"key11".to_vec(),
key: put_key.clone(),
value: b"val12".to_vec(),
prev_kv: false,
})
@@ -71,20 +87,25 @@ pub async fn test_kv_put(kv_backend: impl KvBackend) {
let resp = kv_backend
.put(PutRequest {
key: b"key11".to_vec(),
key: put_key.clone(),
value: b"val13".to_vec(),
prev_kv: true,
})
.await
.unwrap();
let prev_kv = resp.prev_kv.unwrap();
assert_eq!(b"key11", prev_kv.key());
assert_eq!(put_key, prev_kv.key());
assert_eq!(b"val12", prev_kv.value());
}
pub async fn test_kv_range(kv_backend: impl KvBackend) {
let key = b"key1".to_vec();
let range_end = util::get_prefix_end_key(b"key1");
pub async fn test_kv_range(kv_backend: &impl KvBackend) {
test_kv_range_with_prefix(kv_backend, vec![]).await;
}
pub async fn test_kv_range_with_prefix(kv_backend: &impl KvBackend, prefix: Vec<u8>) {
let key = [prefix.clone(), b"key1".to_vec()].concat();
let key11 = [prefix.clone(), b"key11".to_vec()].concat();
let range_end = util::get_prefix_end_key(&key);
let resp = kv_backend
.range(RangeRequest {
@@ -97,9 +118,9 @@ pub async fn test_kv_range(kv_backend: impl KvBackend) {
.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(key, resp.kvs[0].key);
assert_eq!(b"val1", resp.kvs[0].value());
assert_eq!(b"key11", resp.kvs[1].key());
assert_eq!(key11, resp.kvs[1].key);
assert_eq!(b"val11", resp.kvs[1].value());
let resp = kv_backend
@@ -113,9 +134,9 @@ pub async fn test_kv_range(kv_backend: impl KvBackend) {
.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(key, resp.kvs[0].key);
assert_eq!(b"", resp.kvs[0].value());
assert_eq!(b"key11", resp.kvs[1].key());
assert_eq!(key11, resp.kvs[1].key);
assert_eq!(b"", resp.kvs[1].value());
let resp = kv_backend
@@ -129,12 +150,12 @@ pub async fn test_kv_range(kv_backend: impl KvBackend) {
.unwrap();
assert_eq!(1, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(key, resp.kvs[0].key);
assert_eq!(b"val1", resp.kvs[0].value());
let resp = kv_backend
.range(RangeRequest {
key,
key: key.clone(),
range_end,
limit: 1,
keys_only: false,
@@ -143,24 +164,41 @@ pub async fn test_kv_range(kv_backend: impl KvBackend) {
.unwrap();
assert_eq!(1, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(key, resp.kvs[0].key);
assert_eq!(b"val1", resp.kvs[0].value());
}
pub async fn test_kv_range_2(kv_backend: impl KvBackend) {
test_kv_range_2_with_prefix(kv_backend, vec![]).await;
}
pub async fn test_kv_range_2_with_prefix(kv_backend: impl KvBackend, prefix: Vec<u8>) {
let atest = [prefix.clone(), b"atest".to_vec()].concat();
let test = [prefix.clone(), b"test".to_vec()].concat();
kv_backend
.put(PutRequest::new().with_key("atest").with_value("value"))
.put(
PutRequest::new()
.with_key(atest.clone())
.with_value("value"),
)
.await
.unwrap();
kv_backend
.put(PutRequest::new().with_key("test").with_value("value"))
.put(PutRequest::new().with_key(test.clone()).with_value("value"))
.await
.unwrap();
// If both key and range_end are \0, then range represents all keys.
let all_start = [prefix.clone(), b"\0".to_vec()].concat();
let all_end = if prefix.is_empty() {
b"\0".to_vec()
} else {
util::get_prefix_end_key(&prefix)
};
let result = kv_backend
.range(RangeRequest::new().with_range(b"\0".to_vec(), b"\0".to_vec()))
.range(RangeRequest::new().with_range(all_start, all_end.clone()))
.await
.unwrap();
@@ -168,26 +206,28 @@ pub async fn test_kv_range_2(kv_backend: impl KvBackend) {
assert!(!result.more);
// If range_end is \0, the range is all keys greater than or equal to the key argument.
let a_start = [prefix.clone(), b"a".to_vec()].concat();
let result = kv_backend
.range(RangeRequest::new().with_range(b"a".to_vec(), b"\0".to_vec()))
.range(RangeRequest::new().with_range(a_start.clone(), all_end.clone()))
.await
.unwrap();
assert_eq!(result.kvs.len(), 2);
let b_start = [prefix.clone(), b"b".to_vec()].concat();
let result = kv_backend
.range(RangeRequest::new().with_range(b"b".to_vec(), b"\0".to_vec()))
.range(RangeRequest::new().with_range(b_start, all_end.clone()))
.await
.unwrap();
assert_eq!(result.kvs.len(), 1);
assert_eq!(result.kvs[0].key, b"test");
assert_eq!(result.kvs[0].key, test);
// Fetches the keys >= "a", set limit to 1, the `more` should be true.
let result = kv_backend
.range(
RangeRequest::new()
.with_range(b"a".to_vec(), b"\0".to_vec())
.with_range(a_start.clone(), all_end.clone())
.with_limit(1),
)
.await
@@ -199,7 +239,7 @@ pub async fn test_kv_range_2(kv_backend: impl KvBackend) {
let result = kv_backend
.range(
RangeRequest::new()
.with_range(b"a".to_vec(), b"\0".to_vec())
.with_range(a_start.clone(), all_end.clone())
.with_limit(2),
)
.await
@@ -211,16 +251,27 @@ pub async fn test_kv_range_2(kv_backend: impl KvBackend) {
let result = kv_backend
.range(
RangeRequest::new()
.with_range(b"a".to_vec(), b"\0".to_vec())
.with_range(a_start.clone(), all_end.clone())
.with_limit(3),
)
.await
.unwrap();
assert_eq!(result.kvs.len(), 2);
assert!(!result.more);
let req = BatchDeleteRequest {
keys: vec![atest, test],
prev_kv: false,
};
let resp = kv_backend.batch_delete(req).await.unwrap();
assert!(resp.prev_kvs.is_empty());
}
pub async fn test_kv_batch_get(kv_backend: impl KvBackend) {
pub async fn test_kv_batch_get(kv_backend: &impl KvBackend) {
test_kv_batch_get_with_prefix(kv_backend, vec![]).await;
}
pub async fn test_kv_batch_get_with_prefix(kv_backend: &impl KvBackend, prefix: Vec<u8>) {
let keys = vec![];
let resp = kv_backend
.batch_get(BatchGetRequest { keys })
@@ -229,7 +280,8 @@ pub async fn test_kv_batch_get(kv_backend: impl KvBackend) {
assert!(resp.kvs.is_empty());
let keys = vec![b"key10".to_vec()];
let key10 = [prefix.clone(), b"key10".to_vec()].concat();
let keys = vec![key10];
let resp = kv_backend
.batch_get(BatchGetRequest { keys })
.await
@@ -237,29 +289,42 @@ pub async fn test_kv_batch_get(kv_backend: impl KvBackend) {
assert!(resp.kvs.is_empty());
let keys = vec![b"key1".to_vec(), b"key3".to_vec(), b"key4".to_vec()];
let key1 = [prefix.clone(), b"key1".to_vec()].concat();
let key3 = [prefix.clone(), b"key3".to_vec()].concat();
let key4 = [prefix.clone(), b"key4".to_vec()].concat();
let keys = vec![key1.clone(), key3.clone(), key4];
let resp = kv_backend
.batch_get(BatchGetRequest { keys })
.await
.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(key1, resp.kvs[0].key);
assert_eq!(b"val1", resp.kvs[0].value());
assert_eq!(b"key3", resp.kvs[1].key());
assert_eq!(key3, resp.kvs[1].key);
assert_eq!(b"val3", resp.kvs[1].value());
}
pub async fn test_kv_compare_and_put(kv_backend: Arc<dyn KvBackend<Error = Error>>) {
test_kv_compare_and_put_with_prefix(kv_backend, vec![]).await;
}
pub async fn test_kv_compare_and_put_with_prefix(
kv_backend: Arc<dyn KvBackend<Error = Error>>,
prefix: Vec<u8>,
) {
let success = Arc::new(AtomicU8::new(0));
let key = [prefix.clone(), b"key".to_vec()].concat();
let mut joins = vec![];
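// With `expect: vec![]`, compare-and-put only succeeds when the key does not exist yet, so exactly one of the 20 concurrent writers below should win; the assertion on `success` checks exactly that.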
for _ in 0..20 {
let kv_backend_clone = kv_backend.clone();
let success_clone = success.clone();
let key_clone = key.clone();
let join = tokio::spawn(async move {
let req = CompareAndPutRequest {
key: b"key".to_vec(),
key: key_clone,
expect: vec![],
value: b"val_new".to_vec(),
};
@@ -276,11 +341,19 @@ pub async fn test_kv_compare_and_put(kv_backend: Arc<dyn KvBackend<Error = Error
}
assert_eq!(1, success.load(Ordering::SeqCst));
let resp = kv_backend.delete(&key, false).await.unwrap();
assert!(resp.is_none());
}
pub async fn test_kv_delete_range(kv_backend: impl KvBackend) {
test_kv_delete_range_with_prefix(kv_backend, vec![]).await;
}
pub async fn test_kv_delete_range_with_prefix(kv_backend: impl KvBackend, prefix: Vec<u8>) {
let key3 = [prefix.clone(), b"key3".to_vec()].concat();
let req = DeleteRangeRequest {
key: b"key3".to_vec(),
key: key3.clone(),
range_end: vec![],
prev_kv: true,
};
@@ -288,14 +361,15 @@ pub async fn test_kv_delete_range(kv_backend: impl KvBackend) {
let resp = kv_backend.delete_range(req).await.unwrap();
assert_eq!(1, resp.prev_kvs.len());
assert_eq!(1, resp.deleted);
assert_eq!(b"key3", resp.prev_kvs[0].key());
assert_eq!(key3, resp.prev_kvs[0].key);
assert_eq!(b"val3", resp.prev_kvs[0].value());
let resp = kv_backend.get(b"key3").await.unwrap();
let resp = kv_backend.get(&key3).await.unwrap();
assert!(resp.is_none());
let key2 = [prefix.clone(), b"key2".to_vec()].concat();
let req = DeleteRangeRequest {
key: b"key2".to_vec(),
key: key2.clone(),
range_end: vec![],
prev_kv: false,
};
@@ -304,11 +378,11 @@ pub async fn test_kv_delete_range(kv_backend: impl KvBackend) {
assert_eq!(1, resp.deleted);
assert!(resp.prev_kvs.is_empty());
let resp = kv_backend.get(b"key2").await.unwrap();
let resp = kv_backend.get(&key2).await.unwrap();
assert!(resp.is_none());
let key = b"key1".to_vec();
let range_end = util::get_prefix_end_key(b"key1");
let key = [prefix.clone(), b"key1".to_vec()].concat();
let range_end = util::get_prefix_end_key(&key);
let req = DeleteRangeRequest {
key: key.clone(),
@@ -328,34 +402,45 @@ pub async fn test_kv_delete_range(kv_backend: impl KvBackend) {
}
pub async fn test_kv_batch_delete(kv_backend: impl KvBackend) {
assert!(kv_backend.get(b"key1").await.unwrap().is_some());
assert!(kv_backend.get(b"key100").await.unwrap().is_none());
test_kv_batch_delete_with_prefix(kv_backend, vec![]).await;
}
pub async fn test_kv_batch_delete_with_prefix(kv_backend: impl KvBackend, prefix: Vec<u8>) {
let key1 = [prefix.clone(), b"key1".to_vec()].concat();
let key100 = [prefix.clone(), b"key100".to_vec()].concat();
assert!(kv_backend.get(&key1).await.unwrap().is_some());
assert!(kv_backend.get(&key100).await.unwrap().is_none());
let req = BatchDeleteRequest {
keys: vec![b"key1".to_vec(), b"key100".to_vec()],
keys: vec![key1.clone(), key100.clone()],
prev_kv: true,
};
let resp = kv_backend.batch_delete(req).await.unwrap();
assert_eq!(1, resp.prev_kvs.len());
assert_eq!(
vec![KeyValue {
key: b"key1".to_vec(),
key: key1.clone(),
value: b"val1".to_vec()
}],
resp.prev_kvs
);
assert!(kv_backend.get(b"key1").await.unwrap().is_none());
assert!(kv_backend.get(&key1).await.unwrap().is_none());
assert!(kv_backend.get(b"key2").await.unwrap().is_some());
assert!(kv_backend.get(b"key3").await.unwrap().is_some());
let key2 = [prefix.clone(), b"key2".to_vec()].concat();
let key3 = [prefix.clone(), b"key3".to_vec()].concat();
let key11 = [prefix.clone(), b"key11".to_vec()].concat();
assert!(kv_backend.get(&key2).await.unwrap().is_some());
assert!(kv_backend.get(&key3).await.unwrap().is_some());
assert!(kv_backend.get(&key11).await.unwrap().is_some());
let req = BatchDeleteRequest {
keys: vec![b"key2".to_vec(), b"key3".to_vec()],
keys: vec![key2.clone(), key3.clone(), key11.clone()],
prev_kv: false,
};
let resp = kv_backend.batch_delete(req).await.unwrap();
assert!(resp.prev_kvs.is_empty());
assert!(kv_backend.get(b"key2").await.unwrap().is_none());
assert!(kv_backend.get(b"key3").await.unwrap().is_none());
assert!(kv_backend.get(&key2).await.unwrap().is_none());
assert!(kv_backend.get(&key3).await.unwrap().is_none());
assert!(kv_backend.get(&key11).await.unwrap().is_none());
}

View File

@@ -16,6 +16,7 @@
#![feature(btree_extract_if)]
#![feature(async_closure)]
#![feature(let_chains)]
#![feature(extract_if)]
pub mod cache_invalidator;
pub mod cluster;

View File

@@ -474,6 +474,16 @@ pub struct AlterTableTask {
}
impl AlterTableTask {
pub fn validate(&self) -> Result<()> {
self.alter_table
.kind
.as_ref()
.context(error::UnexpectedSnafu {
err_msg: "'kind' is absent",
})?;
Ok(())
}
pub fn table_ref(&self) -> TableReference {
TableReference {
catalog: &self.alter_table.catalog_name,
@@ -625,7 +635,7 @@ impl TryFrom<PbCreateDatabaseTask> for CreateDatabaseTask {
fn try_from(pb: PbCreateDatabaseTask) -> Result<Self> {
let CreateDatabaseExpr {
catalog_name,
database_name,
schema_name,
create_if_not_exists,
options,
} = pb.create_database.context(error::InvalidProtoMsgSnafu {
@@ -634,7 +644,7 @@ impl TryFrom<PbCreateDatabaseTask> for CreateDatabaseTask {
Ok(CreateDatabaseTask {
catalog: catalog_name,
schema: database_name,
schema: schema_name,
create_if_not_exists,
options: Some(options),
})
@@ -655,7 +665,7 @@ impl TryFrom<CreateDatabaseTask> for PbCreateDatabaseTask {
Ok(PbCreateDatabaseTask {
create_database: Some(CreateDatabaseExpr {
catalog_name: catalog,
database_name: schema,
schema_name: schema,
create_if_not_exists,
options: options.unwrap_or_default(),
}),

View File

@@ -56,6 +56,12 @@ pub fn procedure_state_to_pb_response(state: &ProcedureState) -> PbProcedureStat
ProcedureState::Done { .. } => (PbProcedureStatus::Done, String::default()),
ProcedureState::Retrying { error } => (PbProcedureStatus::Retrying, error.to_string()),
ProcedureState::Failed { error } => (PbProcedureStatus::Failed, error.to_string()),
ProcedureState::PrepareRollback { error } => {
(PbProcedureStatus::PrepareRollback, error.to_string())
}
ProcedureState::RollingBack { error } => {
(PbProcedureStatus::RollingBack, error.to_string())
}
};
PbProcedureStateResponse {

View File

@@ -27,6 +27,7 @@ use crate::ddl::DdlContext;
use crate::error::Result;
use crate::key::TableMetadataManager;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::kv_backend::KvBackendRef;
use crate::peer::Peer;
use crate::region_keeper::MemoryRegionKeeper;
use crate::sequence::SequenceBuilder;
@@ -86,6 +87,14 @@ impl<T: MockDatanodeHandler + 'static> DatanodeManager for MockDatanodeManager<T
/// Returns a test purpose [DdlContext].
pub fn new_ddl_context(datanode_manager: DatanodeManagerRef) -> DdlContext {
let kv_backend = Arc::new(MemoryKvBackend::new());
new_ddl_context_with_kv_backend(datanode_manager, kv_backend)
}
/// Returns a test purpose [DdlContext] with a specified [KvBackendRef].
pub fn new_ddl_context_with_kv_backend(
datanode_manager: DatanodeManagerRef,
kv_backend: KvBackendRef,
) -> DdlContext {
let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone()));
DdlContext {

View File

@@ -17,7 +17,7 @@ pub mod kafka;
use std::collections::HashMap;
use std::sync::Arc;
use common_wal::config::MetaSrvWalConfig;
use common_wal::config::MetasrvWalConfig;
use common_wal::options::{KafkaWalOptions, WalOptions, WAL_OPTIONS_KEY};
use snafu::ResultExt;
use store_api::storage::{RegionId, RegionNumber};
@@ -39,10 +39,10 @@ pub type WalOptionsAllocatorRef = Arc<WalOptionsAllocator>;
impl WalOptionsAllocator {
/// Creates a WalOptionsAllocator.
pub fn new(config: MetaSrvWalConfig, kv_backend: KvBackendRef) -> Self {
pub fn new(config: MetasrvWalConfig, kv_backend: KvBackendRef) -> Self {
match config {
MetaSrvWalConfig::RaftEngine => Self::RaftEngine,
MetaSrvWalConfig::Kafka(kafka_config) => {
MetasrvWalConfig::RaftEngine => Self::RaftEngine,
MetasrvWalConfig::Kafka(kafka_config) => {
Self::Kafka(KafkaTopicManager::new(kafka_config, kv_backend))
}
}
@@ -118,7 +118,7 @@ pub fn prepare_wal_options(
#[cfg(test)]
mod tests {
use common_wal::config::kafka::MetaSrvKafkaConfig;
use common_wal::config::kafka::MetasrvKafkaConfig;
use common_wal::test_util::run_test_with_kafka_wal;
use super::*;
@@ -129,7 +129,7 @@ mod tests {
#[tokio::test]
async fn test_allocator_with_raft_engine() {
let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
let wal_config = MetaSrvWalConfig::RaftEngine;
let wal_config = MetasrvWalConfig::RaftEngine;
let allocator = WalOptionsAllocator::new(wal_config, kv_backend);
allocator.start().await.unwrap();
@@ -155,7 +155,7 @@ mod tests {
.collect::<Vec<_>>();
// Creates a topic manager.
let config = MetaSrvKafkaConfig {
let config = MetasrvKafkaConfig {
replication_factor: broker_endpoints.len() as i16,
broker_endpoints,
..Default::default()

View File

@@ -16,7 +16,7 @@ use std::collections::HashSet;
use std::sync::Arc;
use common_telemetry::{error, info};
use common_wal::config::kafka::MetaSrvKafkaConfig;
use common_wal::config::kafka::MetasrvKafkaConfig;
use common_wal::TopicSelectorType;
use rskafka::client::controller::ControllerClient;
use rskafka::client::error::Error as RsKafkaError;
@@ -46,7 +46,7 @@ const DEFAULT_PARTITION: i32 = 0;
/// Manages topic initialization and selection.
pub struct TopicManager {
config: MetaSrvKafkaConfig,
config: MetasrvKafkaConfig,
pub(crate) topic_pool: Vec<String>,
pub(crate) topic_selector: TopicSelectorRef,
kv_backend: KvBackendRef,
@@ -54,7 +54,7 @@ pub struct TopicManager {
impl TopicManager {
/// Creates a new topic manager.
pub fn new(config: MetaSrvKafkaConfig, kv_backend: KvBackendRef) -> Self {
pub fn new(config: MetasrvKafkaConfig, kv_backend: KvBackendRef) -> Self {
// Topics should be created.
let topics = (0..config.num_topics)
.map(|topic_id| format!("{}_{topic_id}", config.topic_name_prefix))
@@ -283,7 +283,7 @@ mod tests {
.collect::<Vec<_>>();
// Creates a topic manager.
let config = MetaSrvKafkaConfig {
let config = MetasrvKafkaConfig {
replication_factor: broker_endpoints.len() as i16,
broker_endpoints,
..Default::default()

View File

@@ -104,12 +104,24 @@ pub enum Error {
location: Location,
},
#[snafu(display("Rollback Procedure recovered: {error}"))]
RollbackProcedureRecovered { error: String, location: Location },
#[snafu(display("Procedure retry exceeded max times, procedure_id: {}", procedure_id))]
RetryTimesExceeded {
source: Arc<Error>,
procedure_id: ProcedureId,
},
#[snafu(display(
"Procedure rollback exceeded max times, procedure_id: {}",
procedure_id
))]
RollbackTimesExceeded {
source: Arc<Error>,
procedure_id: ProcedureId,
},
#[snafu(display("Corrupted data, error: "))]
CorruptedData {
#[snafu(source)]
@@ -145,6 +157,9 @@ pub enum Error {
#[snafu(display("Unexpected: {err_msg}"))]
Unexpected { location: Location, err_msg: String },
#[snafu(display("Not support to rollback the procedure"))]
RollbackNotSupported { location: Location },
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -161,9 +176,12 @@ impl ErrorExt for Error {
| Error::DeleteState { .. }
| Error::FromJson { .. }
| Error::RetryTimesExceeded { .. }
| Error::RollbackTimesExceeded { .. }
| Error::RetryLater { .. }
| Error::WaitWatcher { .. }
| Error::ManagerNotStart { .. } => StatusCode::Internal,
| Error::ManagerNotStart { .. }
| Error::RollbackProcedureRecovered { .. }
| Error::RollbackNotSupported { .. } => StatusCode::Internal,
Error::LoaderConflict { .. } | Error::DuplicateProcedure { .. } => {
StatusCode::InvalidArguments
}

View File

@@ -31,11 +31,11 @@ use tokio::sync::{Mutex as TokioMutex, Notify};
use self::rwlock::KeyRwLock;
use crate::error::{
DuplicateProcedureSnafu, Error, LoaderConflictSnafu, ManagerNotStartSnafu, Result,
self, DuplicateProcedureSnafu, Error, LoaderConflictSnafu, ManagerNotStartSnafu, Result,
StartRemoveOutdatedMetaTaskSnafu, StopRemoveOutdatedMetaTaskSnafu,
};
use crate::local::runner::Runner;
use crate::procedure::BoxedProcedureLoader;
use crate::procedure::{BoxedProcedureLoader, InitProcedureState};
use crate::store::{ProcedureMessage, ProcedureStore, StateStoreRef};
use crate::{
BoxedProcedure, ContextProvider, LockKey, ProcedureId, ProcedureManager, ProcedureState,
@@ -72,8 +72,13 @@ pub(crate) struct ProcedureMeta {
}
impl ProcedureMeta {
fn new(id: ProcedureId, parent_id: Option<ProcedureId>, lock_key: LockKey) -> ProcedureMeta {
let (state_sender, state_receiver) = watch::channel(ProcedureState::Running);
fn new(
id: ProcedureId,
procedure_state: ProcedureState,
parent_id: Option<ProcedureId>,
lock_key: LockKey,
) -> ProcedureMeta {
let (state_sender, state_receiver) = watch::channel(procedure_state);
ProcedureMeta {
id,
parent_id,
@@ -424,12 +429,18 @@ impl LocalManager {
fn submit_root(
&self,
procedure_id: ProcedureId,
procedure_state: ProcedureState,
step: u32,
procedure: BoxedProcedure,
) -> Result<Watcher> {
ensure!(self.manager_ctx.running(), ManagerNotStartSnafu);
let meta = Arc::new(ProcedureMeta::new(procedure_id, None, procedure.lock_key()));
let meta = Arc::new(ProcedureMeta::new(
procedure_id,
procedure_state,
None,
procedure.lock_key(),
));
let runner = Runner {
meta: meta.clone(),
procedure,
@@ -468,13 +479,11 @@ impl LocalManager {
Ok(watcher)
}
/// Recovers unfinished procedures and reruns them.
async fn recover(&self) -> Result<()> {
logging::info!("LocalManager start to recover");
let recover_start = Instant::now();
let (messages, finished_ids) = self.procedure_store.load_messages().await?;
fn submit_recovered_messages(
&self,
messages: HashMap<ProcedureId, ProcedureMessage>,
init_state: InitProcedureState,
) {
for (procedure_id, message) in &messages {
if message.parent_id.is_none() {
// This is the root procedure. We only submit the root procedure as it will
@@ -494,8 +503,21 @@ impl LocalManager {
loaded_procedure.step
);
let procedure_state = match init_state {
InitProcedureState::RollingBack => ProcedureState::RollingBack {
error: Arc::new(
error::RollbackProcedureRecoveredSnafu {
error: message.error.clone().unwrap_or("Unknown error".to_string()),
}
.build(),
),
},
InitProcedureState::Running => ProcedureState::Running,
};
if let Err(e) = self.submit_root(
*procedure_id,
procedure_state,
loaded_procedure.step,
loaded_procedure.procedure,
) {
@@ -503,6 +525,18 @@ impl LocalManager {
}
}
}
}
/// Recovers unfinished procedures and reruns them.
async fn recover(&self) -> Result<()> {
logging::info!("LocalManager start to recover");
let recover_start = Instant::now();
let (messages, rollback_messages, finished_ids) =
self.procedure_store.load_messages().await?;
// Submits recovered messages first.
self.submit_recovered_messages(rollback_messages, InitProcedureState::RollingBack);
self.submit_recovered_messages(messages, InitProcedureState::Running);
if !finished_ids.is_empty() {
logging::info!(
@@ -587,7 +621,12 @@ impl ProcedureManager for LocalManager {
DuplicateProcedureSnafu { procedure_id }
);
self.submit_root(procedure.id, 0, procedure.procedure)
self.submit_root(
procedure.id,
ProcedureState::Running,
0,
procedure.procedure,
)
}
async fn procedure_state(&self, procedure_id: ProcedureId) -> Result<Option<ProcedureState>> {
@@ -626,7 +665,12 @@ pub(crate) mod test_util {
use super::*;
pub(crate) fn procedure_meta_for_test() -> ProcedureMeta {
ProcedureMeta::new(ProcedureId::random(), None, LockKey::default())
ProcedureMeta::new(
ProcedureId::random(),
ProcedureState::Running,
None,
LockKey::default(),
)
}
pub(crate) fn new_object_store(dir: &TempDir) -> ObjectStore {
@@ -914,6 +958,14 @@ mod tests {
}
}
async fn rollback(&mut self, _: &Context) -> Result<()> {
Ok(())
}
fn rollback_supported(&self) -> bool {
true
}
fn dump(&self) -> Result<String> {
Ok(String::new())
}
@@ -923,24 +975,29 @@ mod tests {
}
}
let check_procedure = |procedure| {
async {
let procedure_id = ProcedureId::random();
let mut watcher = manager
.submit(ProcedureWithId {
id: procedure_id,
procedure: Box::new(procedure),
})
.await
.unwrap();
// Wait for the notification.
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_failed());
}
let check_procedure = |procedure| async {
let procedure_id = ProcedureId::random();
manager
.submit(ProcedureWithId {
id: procedure_id,
procedure: Box::new(procedure),
})
.await
.unwrap()
};
check_procedure(MockProcedure { panic: false }).await;
check_procedure(MockProcedure { panic: true }).await;
let mut watcher = check_procedure(MockProcedure { panic: false }).await;
// Wait for the notification.
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_prepare_rollback());
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_rolling_back());
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_failed());
// The runner won't rollback a panicked procedure.
let mut watcher = check_procedure(MockProcedure { panic: true }).await;
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_failed());
}
#[tokio::test]

View File

@@ -23,37 +23,9 @@ use super::rwlock::OwnedKeyRwLockGuard;
use crate::error::{self, ProcedurePanicSnafu, Result};
use crate::local::{ManagerContext, ProcedureMeta, ProcedureMetaRef};
use crate::procedure::{Output, StringKey};
use crate::store::ProcedureStore;
use crate::ProcedureState::Retrying;
use crate::store::{ProcedureMessage, ProcedureStore};
use crate::{BoxedProcedure, Context, Error, ProcedureId, ProcedureState, ProcedureWithId, Status};
#[derive(Debug)]
enum ExecResult {
Continue,
Done,
RetryLater,
Failed,
}
#[cfg(test)]
impl ExecResult {
fn is_continue(&self) -> bool {
matches!(self, ExecResult::Continue)
}
fn is_done(&self) -> bool {
matches!(self, ExecResult::Done)
}
fn is_retry_later(&self) -> bool {
matches!(self, ExecResult::RetryLater)
}
fn is_failed(&self) -> bool {
matches!(self, ExecResult::Failed)
}
}
/// A guard to cleanup procedure state.
struct ProcedureGuard {
meta: ProcedureMetaRef,
@@ -208,129 +180,164 @@ impl Runner {
async fn execute_once_with_retry(&mut self, ctx: &Context) {
let mut retry = self.exponential_builder.build();
let mut retry_times = 0;
let mut rollback = self.exponential_builder.build();
let mut rollback_times = 0;
loop {
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::Failed {
error: Arc::new(error::ManagerNotStartSnafu {}.build()),
});
self.meta.set_state(ProcedureState::failed(Arc::new(
error::ManagerNotStartSnafu {}.build(),
)));
return;
}
match self.execute_once(ctx).await {
ExecResult::Done | ExecResult::Failed => return,
ExecResult::Continue => (),
ExecResult::RetryLater => {
let state = self.meta.state();
match state {
ProcedureState::Running => {}
ProcedureState::Retrying { error } => {
retry_times += 1;
if let Some(d) = retry.next() {
self.wait_on_err(d, retry_times).await;
} else {
assert!(self.meta.state().is_retrying());
if let Retrying { error } = self.meta.state() {
self.meta.set_state(ProcedureState::failed(Arc::new(
self.meta
.set_state(ProcedureState::prepare_rollback(Arc::new(
Error::RetryTimesExceeded {
source: error,
source: error.clone(),
procedure_id: self.meta.id,
},
)))
}
)));
}
}
ProcedureState::PrepareRollback { error }
| ProcedureState::RollingBack { error } => {
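// Rollback attempts use their own backoff budget (`rollback` / `rollback_times`), independent of the execution retries handled above.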
rollback_times += 1;
if let Some(d) = rollback.next() {
self.wait_on_err(d, rollback_times).await;
} else {
self.meta.set_state(ProcedureState::failed(Arc::new(
Error::RollbackTimesExceeded {
source: error.clone(),
procedure_id: self.meta.id,
},
)));
return;
}
}
ProcedureState::Done { .. } => return,
ProcedureState::Failed { .. } => return,
}
self.execute_once(ctx).await;
}
}
async fn rollback(&mut self, ctx: &Context, err: Arc<Error>) {
if self.procedure.rollback_supported() {
if let Err(e) = self.procedure.rollback(ctx).await {
self.meta
.set_state(ProcedureState::rolling_back(Arc::new(e)));
return;
}
}
self.meta.set_state(ProcedureState::failed(err));
}
async fn rollback(&mut self, error: Arc<Error>) -> ExecResult {
if let Err(e) = self.rollback_procedure().await {
self.rolling_back = true;
self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
return ExecResult::RetryLater;
async fn prepare_rollback(&mut self, err: Arc<Error>) {
if let Err(e) = self.write_procedure_state(err.to_string()).await {
self.meta
.set_state(ProcedureState::prepare_rollback(Arc::new(e)));
return;
}
if self.procedure.rollback_supported() {
self.meta.set_state(ProcedureState::rolling_back(err));
} else {
self.meta.set_state(ProcedureState::failed(err));
}
self.meta.set_state(ProcedureState::failed(error));
ExecResult::Failed
}
async fn execute_once(&mut self, ctx: &Context) -> ExecResult {
// if rolling_back, there is no need to execute again.
if self.rolling_back {
// We can definitely get the previous error here.
let state = self.meta.state();
let err = state.error().unwrap();
return self.rollback(err.clone()).await;
}
match self.procedure.execute(ctx).await {
Ok(status) => {
logging::debug!(
"Execute procedure {}-{} once, status: {:?}, need_persist: {}",
self.procedure.type_name(),
self.meta.id,
status,
status.need_persist(),
);
async fn execute_once(&mut self, ctx: &Context) {
match self.meta.state() {
ProcedureState::Running | ProcedureState::Retrying { .. } => {
match self.procedure.execute(ctx).await {
Ok(status) => {
logging::debug!(
"Execute procedure {}-{} once, status: {:?}, need_persist: {}",
self.procedure.type_name(),
self.meta.id,
status,
status.need_persist(),
);
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::Failed {
error: Arc::new(error::ManagerNotStartSnafu {}.build()),
});
return ExecResult::Failed;
}
if status.need_persist() {
if let Err(err) = self.persist_procedure().await {
self.meta.set_state(ProcedureState::retrying(Arc::new(err)));
return ExecResult::RetryLater;
}
}
match status {
Status::Executing { .. } => (),
Status::Suspended { subprocedures, .. } => {
self.on_suspended(subprocedures).await;
}
Status::Done { output } => {
if let Err(e) = self.commit_procedure().await {
self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
return ExecResult::RetryLater;
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::failed(Arc::new(
error::ManagerNotStartSnafu {}.build(),
)));
return;
}
self.done(output);
return ExecResult::Done;
if status.need_persist() {
if let Err(err) = self.persist_procedure().await {
self.meta.set_state(ProcedureState::retrying(Arc::new(err)));
return;
}
}
match status {
Status::Executing { .. } => (),
Status::Suspended { subprocedures, .. } => {
self.on_suspended(subprocedures).await;
}
Status::Done { output } => {
if let Err(e) = self.commit_procedure().await {
self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
return;
}
self.done(output);
}
}
}
Err(e) => {
logging::error!(
e;
"Failed to execute procedure {}-{}, retry: {}",
self.procedure.type_name(),
self.meta.id,
e.is_retry_later(),
);
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::failed(Arc::new(
error::ManagerNotStartSnafu {}.build(),
)));
return;
}
if e.is_retry_later() {
self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
return;
}
self.meta
.set_state(ProcedureState::prepare_rollback(Arc::new(e)));
}
}
ExecResult::Continue
}
Err(e) => {
logging::error!(
e;
"Failed to execute procedure {}-{}, retry: {}",
self.procedure.type_name(),
self.meta.id,
e.is_retry_later(),
);
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::Failed {
error: Arc::new(error::ManagerNotStartSnafu {}.build()),
});
return ExecResult::Failed;
}
if e.is_retry_later() {
self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
return ExecResult::RetryLater;
}
// Write rollback key so we can skip this procedure while recovering procedures.
self.rollback(Arc::new(e)).await
}
ProcedureState::PrepareRollback { error } => self.prepare_rollback(error).await,
ProcedureState::RollingBack { error } => self.rollback(ctx, error).await,
ProcedureState::Failed { .. } | ProcedureState::Done { .. } => (),
}
}
/// Submit a subprocedure with specific `procedure_id`.
fn submit_subprocedure(&self, procedure_id: ProcedureId, mut procedure: BoxedProcedure) {
fn submit_subprocedure(
&self,
procedure_id: ProcedureId,
procedure_state: ProcedureState,
mut procedure: BoxedProcedure,
) {
if self.manager_ctx.contains_procedure(procedure_id) {
// If the parent has already submitted this procedure, don't submit it again.
return;
@@ -350,6 +357,7 @@ impl Runner {
let meta = Arc::new(ProcedureMeta::new(
procedure_id,
procedure_state,
Some(self.meta.id),
procedure.lock_key(),
));
@@ -407,7 +415,11 @@ impl Runner {
subprocedure.id,
);
self.submit_subprocedure(subprocedure.id, subprocedure.procedure);
self.submit_subprocedure(
subprocedure.id,
ProcedureState::Running,
subprocedure.procedure,
);
}
logging::info!(
@@ -469,9 +481,19 @@ impl Runner {
Ok(())
}
async fn rollback_procedure(&mut self) -> Result<()> {
async fn write_procedure_state(&mut self, error: String) -> Result<()> {
// Persists procedure state
let type_name = self.procedure.type_name().to_string();
let data = self.procedure.dump()?;
let message = ProcedureMessage {
type_name,
data,
parent_id: self.meta.parent_id,
step: self.step,
error: Some(error),
};
self.store
.rollback_procedure(self.meta.id, self.step)
.rollback_procedure(self.meta.id, message)
.await
.map_err(|e| {
logging::error!(
@@ -510,6 +532,7 @@ mod tests {
use futures_util::future::BoxFuture;
use futures_util::FutureExt;
use object_store::ObjectStore;
use tokio::sync::mpsc;
use super::*;
use crate::local::test_util;
@@ -566,11 +589,13 @@ mod tests {
}
}
#[derive(Debug)]
type RollbackFn = Box<dyn FnMut(Context) -> BoxFuture<'static, Result<()>> + Send>;
struct ProcedureAdapter<F> {
data: String,
lock_key: LockKey,
exec_fn: F,
rollback_fn: Option<RollbackFn>,
}
impl<F> ProcedureAdapter<F> {
@@ -597,6 +622,17 @@ mod tests {
f.await
}
async fn rollback(&mut self, ctx: &Context) -> Result<()> {
if let Some(f) = &mut self.rollback_fn {
return (f)(ctx.clone()).await;
}
Ok(())
}
fn rollback_supported(&self) -> bool {
self.rollback_fn.is_some()
}
fn dump(&self) -> Result<String> {
Ok(self.data.clone())
}
@@ -623,6 +659,7 @@ mod tests {
data: "normal".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("normal");
@@ -633,8 +670,9 @@ mod tests {
let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_continue(), "{res:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_running(), "{state:?}");
check_files(
&object_store,
&procedure_store,
@@ -643,8 +681,9 @@ mod tests {
)
.await;
let res = runner.execute_once(&ctx).await;
assert!(res.is_done(), "{res:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_done(), "{state:?}");
check_files(
&object_store,
&procedure_store,
@@ -684,6 +723,7 @@ mod tests {
data: "suspend".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("suspend");
@@ -694,8 +734,9 @@ mod tests {
let mut runner = new_runner(meta, Box::new(suspend), procedure_store);
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_continue(), "{res:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_running(), "{state:?}");
}
fn new_child_procedure(procedure_id: ProcedureId, keys: &[&str]) -> ProcedureWithId {
@@ -716,6 +757,7 @@ mod tests {
data: "child".to_string(),
lock_key: LockKey::new_exclusive(keys.iter().map(|k| k.to_string())),
exec_fn,
rollback_fn: None,
};
ProcedureWithId {
@@ -784,6 +826,7 @@ mod tests {
data: "parent".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("parent");
@@ -830,6 +873,7 @@ mod tests {
data: "normal".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("test_running_is_stopped");
@@ -840,8 +884,9 @@ mod tests {
let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_continue(), "{res:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_running(), "{state:?}");
check_files(
&object_store,
&procedure_store,
@@ -851,8 +896,9 @@ mod tests {
.await;
runner.manager_ctx.stop();
let res = runner.execute_once(&ctx).await;
assert!(res.is_failed());
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_failed(), "{state:?}");
// Shouldn't write any files
check_files(
&object_store,
@@ -871,6 +917,7 @@ mod tests {
data: "fail".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("test_running_is_stopped_on_error");
@@ -881,8 +928,9 @@ mod tests {
let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
runner.manager_ctx.stop();
let res = runner.execute_once(&ctx).await;
assert!(res.is_failed(), "{res:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_failed(), "{state:?}");
// Shouldn't write any files
check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
}
@@ -895,6 +943,7 @@ mod tests {
data: "fail".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("fail");
@@ -905,9 +954,53 @@ mod tests {
let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_failed(), "{res:?}");
assert!(meta.state().is_failed());
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_prepare_rollback(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_failed(), "{state:?}");
check_files(
&object_store,
&procedure_store,
ctx.procedure_id,
&["0000000000.rollback"],
)
.await;
}
#[tokio::test]
async fn test_execute_with_rollback_on_error() {
let exec_fn =
|_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
let rollback_fn = move |_| async move { Ok(()) }.boxed();
let fail = ProcedureAdapter {
data: "fail".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: Some(Box::new(rollback_fn)),
};
let dir = create_temp_dir("fail");
let meta = fail.new_meta(ROOT_ID);
let ctx = context_without_provider(meta.id);
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
runner.manager_ctx.start();
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_prepare_rollback(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_rolling_back(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_failed(), "{state:?}");
check_files(
&object_store,
&procedure_store,
@@ -937,6 +1030,7 @@ mod tests {
data: "retry_later".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("retry_later");
@@ -946,13 +1040,13 @@ mod tests {
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
runner.manager_ctx.start();
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_retrying(), "{state:?}");
let res = runner.execute_once(&ctx).await;
assert!(res.is_retry_later(), "{res:?}");
assert!(meta.state().is_retrying());
let res = runner.execute_once(&ctx).await;
assert!(res.is_done(), "{res:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_done(), "{state:?}");
assert!(meta.state().is_done());
check_files(
&object_store,
@@ -972,6 +1066,7 @@ mod tests {
data: "exceed_max_retry_later".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("exceed_max_retry_later");
@@ -995,6 +1090,85 @@ mod tests {
assert!(err.contains("Procedure retry exceeded max times"));
}
#[tokio::test]
async fn test_rollback_exceed_max_retry_later() {
let exec_fn =
|_| async { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed();
let rollback_fn = move |_| {
async move { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed()
};
let exceed_max_retry_later = ProcedureAdapter {
data: "exceed_max_rollback".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: Some(Box::new(rollback_fn)),
};
let dir = create_temp_dir("exceed_max_rollback");
let meta = exceed_max_retry_later.new_meta(ROOT_ID);
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(
meta.clone(),
Box::new(exceed_max_retry_later),
procedure_store,
);
runner.manager_ctx.start();
runner.exponential_builder = ExponentialBuilder::default()
.with_min_delay(Duration::from_millis(1))
.with_max_times(3);
// Run the runner and execute the procedure.
runner.execute_procedure_in_loop().await;
let err = meta.state().error().unwrap().to_string();
assert!(err.contains("Procedure rollback exceeded max times"));
}
#[tokio::test]
async fn test_rollback_after_retry_fail() {
let exec_fn = move |_| {
async move { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) }.boxed()
};
let (tx, mut rx) = mpsc::channel(1);
let rollback_fn = move |_| {
let tx = tx.clone();
async move {
tx.send(()).await.unwrap();
Ok(())
}
.boxed()
};
let retry_later = ProcedureAdapter {
data: "rollback_after_retry_fail".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: Some(Box::new(rollback_fn)),
};
let dir = create_temp_dir("retry_later");
let meta = retry_later.new_meta(ROOT_ID);
let ctx = context_without_provider(meta.id);
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
runner.manager_ctx.start();
runner.exponential_builder = ExponentialBuilder::default()
.with_min_delay(Duration::from_millis(1))
.with_max_times(3);
// Run the runner and execute the procedure.
runner.execute_procedure_in_loop().await;
rx.recv().await.unwrap();
assert_eq!(rx.try_recv().unwrap_err(), mpsc::error::TryRecvError::Empty);
check_files(
&object_store,
&procedure_store,
ctx.procedure_id,
&["0000000000.rollback"],
)
.await;
}
#[tokio::test]
async fn test_child_error() {
let mut times = 0;
@@ -1013,6 +1187,7 @@ mod tests {
data: "fail".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
exec_fn,
rollback_fn: None,
};
Ok(Status::Suspended {
@@ -1047,6 +1222,7 @@ mod tests {
data: "parent".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("child_err");

View File

@@ -23,7 +23,7 @@ use smallvec::{smallvec, SmallVec};
use snafu::{ResultExt, Snafu};
use uuid::Uuid;
use crate::error::{Error, Result};
use crate::error::{self, Error, Result};
use crate::watcher::Watcher;
pub type Output = Arc<dyn Any + Send + Sync>;
@@ -125,6 +125,18 @@ pub trait Procedure: Send {
/// The implementation must be idempotent.
async fn execute(&mut self, ctx: &Context) -> Result<Status>;
/// Rollback the failed procedure.
///
/// The implementation must be idempotent.
async fn rollback(&mut self, _: &Context) -> Result<()> {
error::RollbackNotSupportedSnafu {}.fail()
}
/// Indicates whether it supports rolling back the procedure.
fn rollback_supported(&self) -> bool {
false
}
/// Dump the state of the procedure to a string.
fn dump(&self) -> Result<String>;
@@ -289,6 +301,10 @@ pub enum ProcedureState {
Done { output: Option<Output> },
/// The procedure is failed and can be retried.
Retrying { error: Arc<Error> },
/// The procedure is failed and commits state before rolling back the procedure.
PrepareRollback { error: Arc<Error> },
/// The procedure is failed and can be rolled back.
RollingBack { error: Arc<Error> },
/// The procedure is failed and cannot proceed anymore.
Failed { error: Arc<Error> },
}
@@ -299,6 +315,16 @@ impl ProcedureState {
ProcedureState::Failed { error }
}
/// Returns a [ProcedureState] with prepare rollback state.
pub fn prepare_rollback(error: Arc<Error>) -> ProcedureState {
ProcedureState::PrepareRollback { error }
}
/// Returns a [ProcedureState] with rolling back state.
pub fn rolling_back(error: Arc<Error>) -> ProcedureState {
ProcedureState::RollingBack { error }
}
/// Returns a [ProcedureState] with retrying state.
pub fn retrying(error: Arc<Error>) -> ProcedureState {
ProcedureState::Retrying { error }
@@ -324,16 +350,34 @@ impl ProcedureState {
matches!(self, ProcedureState::Retrying { .. })
}
/// Returns true if the procedure state is rolling back.
pub fn is_rolling_back(&self) -> bool {
matches!(self, ProcedureState::RollingBack { .. })
}
/// Returns true if the procedure state is prepare rollback.
pub fn is_prepare_rollback(&self) -> bool {
matches!(self, ProcedureState::PrepareRollback { .. })
}
/// Returns the error.
pub fn error(&self) -> Option<&Arc<Error>> {
match self {
ProcedureState::Failed { error } => Some(error),
ProcedureState::Retrying { error } => Some(error),
ProcedureState::RollingBack { error } => Some(error),
_ => None,
}
}
}
/// The initial procedure state.
#[derive(Debug, Clone)]
pub enum InitProcedureState {
Running,
RollingBack,
}
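As a small illustration of how the enlarged state set is meant to be consumed (a hedged sketch with a hypothetical helper name, not part of this change): only `Done` and `Failed` are terminal, while the two new rollback states, like `Retrying`, tell a watcher to keep waiting.
// Hypothetical same-crate helper, for illustration only.
fn is_terminal_sketch(state: &ProcedureState) -> bool {
    // Running, Retrying, PrepareRollback and RollingBack are all transient.
    matches!(
        state,
        ProcedureState::Done { .. } | ProcedureState::Failed { .. }
    )
}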
// TODO(yingwen): Shutdown
/// `ProcedureManager` executes [Procedure] submitted to it.
#[async_trait]

View File

@@ -50,6 +50,9 @@ pub struct ProcedureMessage {
pub parent_id: Option<ProcedureId>,
/// Current step.
pub step: u32,
/// Errors raised during the procedure.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
/// Procedure storage layer.
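For illustration, a hedged sketch of how the new `error` field behaves under serde, using only the fields visible in this diff and assuming it runs next to the existing store tests (which already serialize `ProcedureMessage` with `serde_json`); the field values here are made up:
let msg = ProcedureMessage {
    type_name: "MockProcedure".to_string(),
    data: "{}".to_string(),
    parent_id: None,
    step: 0,
    error: None,
};
// `skip_serializing_if = "Option::is_none"` drops the field from the output...
let json = serde_json::to_string(&msg).unwrap();
assert!(!json.contains("\"error\""));
// ...and `#[serde(default)]` lets records written before this change, which have
// no `error` key at all, still deserialize with `error` falling back to `None`.
let old: ProcedureMessage = serde_json::from_str(&json).unwrap();
assert!(old.error.is_none());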
@@ -85,6 +88,7 @@ impl ProcedureStore {
data,
parent_id,
step,
error: None,
};
let key = ParsedKey {
prefix: &self.proc_path,
@@ -122,16 +126,19 @@ impl ProcedureStore {
pub(crate) async fn rollback_procedure(
&self,
procedure_id: ProcedureId,
step: u32,
message: ProcedureMessage,
) -> Result<()> {
let key = ParsedKey {
prefix: &self.proc_path,
procedure_id,
step,
step: message.step,
key_type: KeyType::Rollback,
}
.to_string();
self.store.put(&key, Vec::new()).await?;
self.store
.put(&key, serde_json::to_vec(&message).context(ToJsonSnafu)?)
.await?;
Ok(())
}
@@ -176,11 +183,18 @@ impl ProcedureStore {
Ok(())
}
/// Load procedures from the storage. Returns a map of uncommitted procedures and a list
/// of finished procedures' ids.
/// Load procedures from the storage.
/// Returns:
/// - a map of uncommitted procedures
/// - a map of rolling back procedures
/// - a list of finished procedures' ids
pub(crate) async fn load_messages(
&self,
) -> Result<(HashMap<ProcedureId, ProcedureMessage>, Vec<ProcedureId>)> {
) -> Result<(
HashMap<ProcedureId, ProcedureMessage>,
HashMap<ProcedureId, ProcedureMessage>,
Vec<ProcedureId>,
)> {
// Track the key-value pair by procedure id.
let mut procedure_key_values: HashMap<_, (ParsedKey, Vec<u8>)> = HashMap::new();
@@ -204,21 +218,33 @@ impl ProcedureStore {
}
let mut messages = HashMap::with_capacity(procedure_key_values.len());
let mut rollback_messages = HashMap::new();
let mut finished_ids = Vec::new();
for (procedure_id, (parsed_key, value)) in procedure_key_values {
if parsed_key.key_type == KeyType::Step {
let Some(message) = self.load_one_message(&parsed_key, &value) else {
// We don't abort the loading process and just ignore errors to ensure all remaining
// procedures are loaded.
continue;
};
let _ = messages.insert(procedure_id, message);
} else {
finished_ids.push(procedure_id);
match parsed_key.key_type {
KeyType::Step => {
let Some(message) = self.load_one_message(&parsed_key, &value) else {
// We don't abort the loading process and just ignore errors to ensure all remaining
// procedures are loaded.
continue;
};
let _ = messages.insert(procedure_id, message);
}
KeyType::Commit => {
finished_ids.push(procedure_id);
}
KeyType::Rollback => {
let Some(message) = self.load_one_message(&parsed_key, &value) else {
// We don't abort the loading process and just ignore errors to ensure all remaining
// procedures are loaded.
continue;
};
let _ = rollback_messages.insert(procedure_id, message);
}
}
}
Ok((messages, finished_ids))
Ok((messages, rollback_messages, finished_ids))
}
fn load_one_message(&self, key: &ParsedKey, value: &[u8]) -> Option<ProcedureMessage> {
@@ -430,6 +456,7 @@ mod tests {
data: "no parent id".to_string(),
parent_id: None,
step: 4,
error: None,
};
let json = serde_json::to_string(&message).unwrap();
@@ -490,8 +517,9 @@ mod tests {
.await
.unwrap();
let (messages, finished) = store.load_messages().await.unwrap();
let (messages, rollback_messages, finished) = store.load_messages().await.unwrap();
assert_eq!(1, messages.len());
assert!(rollback_messages.is_empty());
assert!(finished.is_empty());
let msg = messages.get(&procedure_id).unwrap();
let expect = ProcedureMessage {
@@ -499,6 +527,7 @@ mod tests {
data: "test store procedure".to_string(),
parent_id: None,
step: 0,
error: None,
};
assert_eq!(expect, *msg);
}
@@ -518,8 +547,9 @@ mod tests {
.unwrap();
store.commit_procedure(procedure_id, 1).await.unwrap();
let (messages, finished) = store.load_messages().await.unwrap();
let (messages, rollback_messages, finished) = store.load_messages().await.unwrap();
assert!(messages.is_empty());
assert!(rollback_messages.is_empty());
assert_eq!(&[procedure_id], &finished[..]);
}
@@ -533,14 +563,32 @@ mod tests {
let type_name = procedure.type_name().to_string();
let data = procedure.dump().unwrap();
store
.store_procedure(procedure_id, 0, type_name, data, None)
.store_procedure(
procedure_id,
0,
type_name.to_string(),
data.to_string(),
None,
)
.await
.unwrap();
let message = ProcedureMessage {
type_name,
data,
parent_id: None,
step: 1,
error: None,
};
store
.rollback_procedure(procedure_id, message)
.await
.unwrap();
store.rollback_procedure(procedure_id, 1).await.unwrap();
let (messages, finished) = store.load_messages().await.unwrap();
let (messages, rollback_messages, finished) = store.load_messages().await.unwrap();
assert!(messages.is_empty());
assert_eq!(&[procedure_id], &finished[..]);
assert_eq!(1, rollback_messages.len());
assert!(finished.is_empty());
assert!(rollback_messages.contains_key(&procedure_id));
}
#[tokio::test]
@@ -565,8 +613,9 @@ mod tests {
store.delete_procedure(procedure_id).await.unwrap();
let (messages, finished) = store.load_messages().await.unwrap();
let (messages, rollback_messages, finished) = store.load_messages().await.unwrap();
assert!(messages.is_empty());
assert!(rollback_messages.is_empty());
assert!(finished.is_empty());
}
@@ -595,8 +644,9 @@ mod tests {
store.delete_procedure(procedure_id).await.unwrap();
let (messages, finished) = store.load_messages().await.unwrap();
let (messages, rollback_messages, finished) = store.load_messages().await.unwrap();
assert!(messages.is_empty());
assert!(rollback_messages.is_empty());
assert!(finished.is_empty());
}
@@ -657,8 +707,9 @@ mod tests {
.await
.unwrap();
let (messages, finished) = store.load_messages().await.unwrap();
let (messages, rollback_messages, finished) = store.load_messages().await.unwrap();
assert_eq!(2, messages.len());
assert!(rollback_messages.is_empty());
assert_eq!(1, finished.len());
let msg = messages.get(&id0).unwrap();

View File

@@ -37,6 +37,12 @@ pub async fn wait(watcher: &mut Watcher) -> Result<Option<Output>> {
ProcedureState::Retrying { error } => {
debug!("retrying, source: {}", error)
}
ProcedureState::RollingBack { error } => {
debug!("rolling back, source: {:?}", error)
}
ProcedureState::PrepareRollback { error } => {
debug!("commit rollback, source: {}", error)
}
}
}
}

View File

@@ -17,12 +17,12 @@
mod df_substrait;
pub mod error;
pub mod extension_serializer;
use std::sync::Arc;
use async_trait::async_trait;
use bytes::{Buf, Bytes};
use datafusion::catalog::CatalogList;
pub use substrait_proto;
pub use crate::df_substrait::DFLogicalSubstraitConvertor;

View File

@@ -17,16 +17,16 @@ pub mod raft_engine;
use serde::{Deserialize, Serialize};
use crate::config::kafka::{DatanodeKafkaConfig, MetaSrvKafkaConfig, StandaloneKafkaConfig};
use crate::config::kafka::{DatanodeKafkaConfig, MetasrvKafkaConfig, StandaloneKafkaConfig};
use crate::config::raft_engine::RaftEngineConfig;
/// Wal configurations for metasrv.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Default)]
#[serde(tag = "provider", rename_all = "snake_case")]
pub enum MetaSrvWalConfig {
pub enum MetasrvWalConfig {
#[default]
RaftEngine,
Kafka(MetaSrvKafkaConfig),
Kafka(MetasrvKafkaConfig),
}
/// Wal configurations for datanode.
@@ -57,11 +57,11 @@ impl Default for StandaloneWalConfig {
}
}
impl From<StandaloneWalConfig> for MetaSrvWalConfig {
impl From<StandaloneWalConfig> for MetasrvWalConfig {
fn from(config: StandaloneWalConfig) -> Self {
match config {
StandaloneWalConfig::RaftEngine(_) => Self::RaftEngine,
StandaloneWalConfig::Kafka(config) => Self::Kafka(MetaSrvKafkaConfig {
StandaloneWalConfig::Kafka(config) => Self::Kafka(MetasrvKafkaConfig {
broker_endpoints: config.broker_endpoints,
num_topics: config.num_topics,
selector_type: config.selector_type,
@@ -100,7 +100,7 @@ mod tests {
use super::*;
use crate::config::kafka::common::BackoffConfig;
use crate::config::{DatanodeKafkaConfig, MetaSrvKafkaConfig, StandaloneKafkaConfig};
use crate::config::{DatanodeKafkaConfig, MetasrvKafkaConfig, StandaloneKafkaConfig};
use crate::TopicSelectorType;
#[test]
@@ -109,8 +109,8 @@ mod tests {
let toml_str = r#"
provider = "raft_engine"
"#;
let metasrv_wal_config: MetaSrvWalConfig = toml::from_str(toml_str).unwrap();
assert_eq!(metasrv_wal_config, MetaSrvWalConfig::RaftEngine);
let metasrv_wal_config: MetasrvWalConfig = toml::from_str(toml_str).unwrap();
assert_eq!(metasrv_wal_config, MetasrvWalConfig::RaftEngine);
let datanode_wal_config: DatanodeWalConfig = toml::from_str(toml_str).unwrap();
assert_eq!(
@@ -166,9 +166,9 @@ mod tests {
backoff_deadline = "5mins"
"#;
// Deserialized to MetaSrvWalConfig.
let metasrv_wal_config: MetaSrvWalConfig = toml::from_str(toml_str).unwrap();
let expected = MetaSrvKafkaConfig {
// Deserialized to MetasrvWalConfig.
let metasrv_wal_config: MetasrvWalConfig = toml::from_str(toml_str).unwrap();
let expected = MetasrvKafkaConfig {
broker_endpoints: vec!["127.0.0.1:9092".to_string()],
num_topics: 32,
selector_type: TopicSelectorType::RoundRobin,
@@ -183,7 +183,7 @@ mod tests {
deadline: Some(Duration::from_secs(60 * 5)),
},
};
assert_eq!(metasrv_wal_config, MetaSrvWalConfig::Kafka(expected));
assert_eq!(metasrv_wal_config, MetasrvWalConfig::Kafka(expected));
// Deserialized to DatanodeWalConfig.
let datanode_wal_config: DatanodeWalConfig = toml::from_str(toml_str).unwrap();

View File

@@ -18,5 +18,5 @@ pub mod metasrv;
pub mod standalone;
pub use datanode::DatanodeKafkaConfig;
pub use metasrv::MetaSrvKafkaConfig;
pub use metasrv::MetasrvKafkaConfig;
pub use standalone::StandaloneKafkaConfig;

View File

@@ -22,7 +22,7 @@ use crate::{TopicSelectorType, BROKER_ENDPOINT, TOPIC_NAME_PREFIX};
/// Kafka wal configurations for metasrv.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(default)]
pub struct MetaSrvKafkaConfig {
pub struct MetasrvKafkaConfig {
/// The broker endpoints of the Kafka cluster.
pub broker_endpoints: Vec<String>,
/// The number of topics to be created upon start.
@@ -43,7 +43,7 @@ pub struct MetaSrvKafkaConfig {
pub backoff: BackoffConfig,
}
impl Default for MetaSrvKafkaConfig {
impl Default for MetasrvKafkaConfig {
fn default() -> Self {
let broker_endpoints = vec![BROKER_ENDPOINT.to_string()];
let replication_factor = broker_endpoints.len() as i16;

View File

@@ -14,8 +14,6 @@
//! Datanode configurations
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_grpc::channel_manager::{
DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE, DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE,
@@ -65,13 +63,6 @@ impl ObjectStoreConfig {
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct StorageConfig {
/// Retention period for all tables.
///
/// Default value is `None`, which means no TTL.
///
/// The precedence order is: ttl in table options > global ttl.
#[serde(with = "humantime_serde")]
pub global_ttl: Option<Duration>,
/// The working directory of database
pub data_home: String,
#[serde(flatten)]
@@ -82,7 +73,6 @@ pub struct StorageConfig {
impl Default for StorageConfig {
fn default() -> Self {
Self {
global_ttl: None,
data_home: DEFAULT_DATA_HOME.to_string(),
store: ObjectStoreConfig::default(),
providers: vec![],

View File

@@ -545,9 +545,7 @@ impl RegionServerInner {
match region_change {
RegionChange::None => {}
RegionChange::Register(_, _) | RegionChange::Deregisters => {
self.region_map
.remove(&region_id)
.map(|(id, engine)| engine.set_writable(id, false));
self.region_map.remove(&region_id);
}
}
}

View File

@@ -14,17 +14,30 @@ common-error.workspace = true
common-macro.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datafusion-substrait.workspace = true
datatypes.workspace = true
enum_dispatch = "0.3"
hydroflow = "0.5.0"
# This fork simply keeps the dependency in our org and pins the version;
# it is the same as the upstream repo.
datafusion-common.workspace = true
datafusion-expr.workspace = true
hydroflow = { git = "https://github.com/GreptimeTeam/hydroflow.git", rev = "ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94" }
itertools.workspace = true
num-traits = "0.2"
serde.workspace = true
servers.workspace = true
smallvec.workspace = true
snafu.workspace = true
strum.workspace = true
substrait.workspace = true
tokio.workspace = true
tonic.workspace = true
[dev-dependencies]
catalog.workspace = true
common-catalog.workspace = true
prost.workspace = true
query.workspace = true
serde_json = "1.0"
session.workspace = true
table.workspace = true

src/flow/clippy.toml Normal file
View File

@@ -0,0 +1,3 @@
# Whether to only check for missing documentation in items visible within the current crate. For example, pub(crate) items. (default: false)
# This is a config for clippy::missing_docs_in_private_items
missing-docs-in-crate-items = true

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Error definition for flow module
use std::any::Any;
use common_macro::stack_trace_debug;
@@ -25,6 +27,7 @@ use snafu::{Location, Snafu};
use crate::expr::EvalError;
/// This error is used to represent all possible errors that can occur in the flow module.
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
@@ -54,18 +57,49 @@ pub enum Error {
#[snafu(display("No protobuf type for value: {value}"))]
NoProtoType { value: Value, location: Location },
#[snafu(display("Not implement in flow: {reason}"))]
NotImplemented { reason: String, location: Location },
#[snafu(display("Flow plan error: {reason}"))]
Plan { reason: String, location: Location },
#[snafu(display("Unsupported temporal filter: {reason}"))]
UnsupportedTemporalFilter { reason: String, location: Location },
#[snafu(display("Datatypes error: {source} with extra message: {extra}"))]
Datatypes {
source: datatypes::Error,
extra: String,
location: Location,
},
#[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
Datafusion {
raw: datafusion_common::DataFusionError,
context: String,
location: Location,
},
}
/// Result type for flow module
pub type Result<T> = std::result::Result<T, Error>;
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Self::Eval { .. } | &Self::JoinTask { .. } => StatusCode::Internal,
Self::Eval { .. } | &Self::JoinTask { .. } | &Self::Datafusion { .. } => {
StatusCode::Internal
}
&Self::TableAlreadyExist { .. } => StatusCode::TableAlreadyExists,
Self::TableNotFound { .. } => StatusCode::TableNotFound,
&Self::InvalidQuery { .. } => StatusCode::PlanQuery,
&Self::InvalidQuery { .. } | &Self::Plan { .. } | &Self::Datatypes { .. } => {
StatusCode::PlanQuery
}
Self::NoProtoType { .. } => StatusCode::Unexpected,
&Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
StatusCode::Unsupported
}
}
}

src/flow/src/compute.rs Normal file
View File

@@ -0,0 +1,19 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Build and Compute the dataflow
mod render;
mod state;
mod types;

View File

@@ -0,0 +1,536 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! In this file, `render` means converting a static `Plan` into an executable dataflow.
//!
//! The [`Context`] is the environment for the render process; it carries all the information the render process needs.
use std::cell::RefCell;
use std::collections::{BTreeMap, VecDeque};
use std::rc::Rc;
use hydroflow::lattices::cc_traits::Get;
use hydroflow::scheduled::graph::Hydroflow;
use hydroflow::scheduled::graph_ext::GraphExt;
use hydroflow::scheduled::port::{PortCtx, SEND};
use itertools::Itertools;
use snafu::{OptionExt, ResultExt};
use super::state::Scheduler;
use crate::adapter::error::{Error, EvalSnafu, InvalidQuerySnafu};
use crate::compute::state::DataflowState;
use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff};
use crate::expr::{
self, EvalError, GlobalId, LocalId, MapFilterProject, MfpPlan, SafeMfpPlan, ScalarExpr,
};
use crate::plan::Plan;
use crate::repr::{self, DiffRow, KeyValDiffRow, Row};
use crate::utils::{ArrangeHandler, Arrangement};
/// The context for building an operator identified by a `GlobalId`.
pub struct Context<'referred, 'df> {
pub id: GlobalId,
pub df: &'referred mut Hydroflow<'df>,
pub compute_state: &'referred mut DataflowState,
/// a list of all collections being used in the operator
pub input_collection: BTreeMap<GlobalId, CollectionBundle>,
/// used by `Get`/`Let` Plan for getting/setting local variables
///
/// TODO(discord9): consider using `Vec<(LocalId, CollectionBundle)>` instead
local_scope: Vec<BTreeMap<LocalId, CollectionBundle>>,
// Collect all errors in this operator's evaluation
err_collector: ErrCollector,
}
impl<'referred, 'df> Drop for Context<'referred, 'df> {
fn drop(&mut self) {
for bundle in std::mem::take(&mut self.input_collection)
.into_values()
.chain(
std::mem::take(&mut self.local_scope)
.into_iter()
.flat_map(|v| v.into_iter())
.map(|(_k, v)| v),
)
{
bundle.collection.into_inner().drop(self.df);
drop(bundle.arranged);
}
// The automatically generated "drop glue" then recursively calls the destructors of all the remaining fields (including the now-empty `input_collection`)
}
}
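
The Drop impl above drains `input_collection` and `local_scope` by hand before the compiler-generated drop glue runs. As a minimal standalone sketch of that pattern (the `Holder` type and its field are illustrative, not part of this PR):

// A minimal standalone sketch: take ownership of a field inside `drop` to tear it
// down explicitly, then let the automatically generated drop glue finish with the
// now-empty field.
struct Holder {
    names: Vec<String>,
}

impl Drop for Holder {
    fn drop(&mut self) {
        for name in std::mem::take(&mut self.names) {
            // explicit per-element teardown would go here
            println!("releasing {name}");
        }
        // drop glue now drops the empty `names` vec
    }
}

fn main() {
    let _h = Holder { names: vec!["a".into(), "b".into()] };
}
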
impl<'referred, 'df> Context<'referred, 'df> {
pub fn insert_global(&mut self, id: GlobalId, collection: CollectionBundle) {
self.input_collection.insert(id, collection);
}
pub fn insert_local(&mut self, id: LocalId, collection: CollectionBundle) {
if let Some(last) = self.local_scope.last_mut() {
last.insert(id, collection);
} else {
let first = BTreeMap::from([(id, collection)]);
self.local_scope.push(first);
}
}
}
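
`insert_local` together with `get_by_id` below treats `local_scope` as a stack of scopes: writes go into the innermost scope, and lookups search from the innermost scope outward. A minimal standalone sketch of that idea, with illustrative names only:

use std::collections::BTreeMap;

// A sketch of the scope-stack idea behind `local_scope`; `Scopes` and its string
// keys are illustrative, not types from this PR.
struct Scopes<V> {
    stack: Vec<BTreeMap<String, V>>,
}

impl<V> Scopes<V> {
    fn new() -> Self {
        Self { stack: Vec::new() }
    }

    fn push_scope(&mut self) {
        self.stack.push(BTreeMap::new());
    }

    // mirrors `insert_local`: insert into the last scope, creating one if needed
    fn insert(&mut self, key: &str, value: V) {
        if let Some(last) = self.stack.last_mut() {
            last.insert(key.to_string(), value);
        } else {
            self.stack.push(BTreeMap::from([(key.to_string(), value)]));
        }
    }

    // mirrors the `Id::Local` branch of `get_by_id`: the innermost scope wins
    fn get(&self, key: &str) -> Option<&V> {
        self.stack.iter().rev().find_map(|scope| scope.get(key))
    }
}

fn main() {
    let mut scopes = Scopes::new();
    scopes.insert("x", 1);
    scopes.push_scope();
    scopes.insert("x", 2);
    assert_eq!(scopes.get("x"), Some(&2));
}
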
// There is a false positive in using `Vec<ScalarExpr>` as key
#[allow(clippy::mutable_key_type)]
impl<'referred, 'df> Context<'referred, 'df> {
/// Interpret and execute plan
///
/// return the output of this plan
pub fn render_plan(&mut self, plan: Plan) -> Result<CollectionBundle, Error> {
match plan {
Plan::Constant { rows } => Ok(self.render_constant(rows)),
Plan::Get { id } => self.get_by_id(id),
Plan::Let { id, value, body } => self.eval_let(id, value, body),
Plan::Mfp { input, mfp } => {
self.render_map_filter_project_into_executable_dataflow(input, mfp)
}
Plan::Reduce { .. } => todo!(),
Plan::Join { .. } => todo!(),
Plan::Union { .. } => todo!(),
}
}
/// Render a `Constant` plan; the `rows` are emitted only once.
pub fn render_constant(&mut self, mut rows: Vec<DiffRow>) -> CollectionBundle {
let (send_port, recv_port) = self.df.make_edge::<_, Toff>("constant");
self.df
.add_subgraph_source("Constant", send_port, move |_ctx, send_port| {
if rows.is_empty() {
return;
}
send_port.give(std::mem::take(&mut rows));
});
CollectionBundle::from_collection(Collection::from_port(recv_port))
}
pub fn get_by_id(&mut self, id: expr::Id) -> Result<CollectionBundle, Error> {
let ret = match id {
expr::Id::Local(local) => {
let bundle = self
.local_scope
.iter()
.rev()
.find_map(|scope| scope.get(&local))
.with_context(|| InvalidQuerySnafu {
reason: format!("Local variable {:?} not found", local),
})?;
bundle.clone(self.df)
}
expr::Id::Global(id) => {
let bundle = self
.input_collection
.get(&id)
.with_context(|| InvalidQuerySnafu {
reason: format!("Collection {:?} not found", id),
})?;
bundle.clone(self.df)
}
};
Ok(ret)
}
/// Eval `Let` operator, useful for assigning a value to a local variable
pub fn eval_let(
&mut self,
id: LocalId,
value: Box<Plan>,
body: Box<Plan>,
) -> Result<CollectionBundle, Error> {
let value = self.render_plan(*value)?;
self.local_scope.push(Default::default());
self.insert_local(id, value);
let ret = self.render_plan(*body)?;
Ok(ret)
}
/// Render a `MapFilterProject` operator; it emits the `rows` only once. All incoming rows are assumed to have a system time of `now`; the system time stated on each row is ignored.
/// TODO(discord9): schedule the mfp operator to run when the temporal filter requires it
///
/// `MapFilterProject` (`mfp` for short) is scheduled to run when there are enough input updates
/// ***or*** when a future update in its output buffer (an `Arrangement`) is due to be emitted now.
pub fn render_map_filter_project_into_executable_dataflow(
&mut self,
input: Box<Plan>,
mfp: MapFilterProject,
) -> Result<CollectionBundle, Error> {
let input = self.render_plan(*input)?;
// TODO(discord9): consider checking whether the plan contains a temporal filter to decide
// whether an arrangement is needed, or whether the added complexity is worth it
let (out_send_port, out_recv_port) = self.df.make_edge::<_, Toff>("mfp");
let input_arity = mfp.input_arity;
// default to an arrangement with only future updates, so it can stay empty if no temporal filter is applied,
// since the stream only sends current updates etc.
let arrange = Arrangement::new();
let arrange_handler = ArrangeHandler::from(arrange.clone());
let arrange_handler_inner = ArrangeHandler::from(arrange);
// This closure captures the following variables:
let mfp_plan = MfpPlan::create_from(mfp)?;
let now = self.compute_state.current_time_ref();
let err_collector = self.err_collector.clone();
// TODO(discord9): better way to schedule future run
let scheduler = self.compute_state.get_scheduler();
let scheduler_inner = scheduler.clone();
let subgraph = self.df.add_subgraph_in_out(
"mfp",
input.collection.into_inner(),
out_send_port,
move |_ctx, recv, send| {
// mfp only needs to passively receive updates from recvs
let data = recv.take_inner().into_iter().flat_map(|v| v.into_iter());
mfp_subgraph(
&arrange_handler_inner,
data,
&mfp_plan,
*now.borrow(),
&err_collector,
&scheduler_inner,
send,
);
},
);
// register current subgraph in scheduler for future scheduling
scheduler.set_cur_subgraph(subgraph);
let arranged = BTreeMap::from([(
(0..input_arity).map(ScalarExpr::Column).collect_vec(),
Arranged::new(arrange_handler),
)]);
let bundle = CollectionBundle {
collection: Collection::from_port(out_recv_port),
arranged,
};
Ok(bundle)
}
}
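
`render_constant` above relies on the source closure owning `rows` and draining it with `std::mem::take`, so the constant data is emitted exactly once even though the closure keeps being invoked on later ticks. A minimal standalone sketch of that trick:

// A sketch of the "emit only once" trick in `render_constant`: later invocations
// see an empty vector and send nothing.
fn main() {
    let mut rows = vec![1, 2, 3];
    let mut emit = move || -> Vec<i32> {
        if rows.is_empty() {
            return Vec::new();
        }
        std::mem::take(&mut rows)
    };

    assert_eq!(emit(), vec![1, 2, 3]);
    assert!(emit().is_empty()); // second tick: nothing left to emit
}
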
fn mfp_subgraph(
arrange: &ArrangeHandler,
input: impl IntoIterator<Item = DiffRow>,
mfp_plan: &MfpPlan,
now: repr::Timestamp,
err_collector: &ErrCollector,
scheduler: &Scheduler,
send: &PortCtx<SEND, Toff>,
) {
let run_mfp = || {
let all_updates = eval_mfp_core(input, mfp_plan, now, err_collector);
arrange.write().apply_updates(now, all_updates)?;
Ok(())
};
err_collector.run(run_mfp);
// Deal with output:
// 1. Read all updates that were emitted between the last time this arrangement had updates and the current time.
// 2. Output the updates.
// 3. Truncate all updates within that range.
let from = arrange.read().last_compaction_time().map(|n| n + 1);
let from = from.unwrap_or(repr::Timestamp::MIN);
let output_kv = arrange.read().get_updates_in_range(from..=now);
// the output is expected to be key -> empty val
let output = output_kv
.into_iter()
.map(|((key, _v), ts, diff)| (key, ts, diff))
.collect_vec();
send.give(output);
let run_compaction = || {
arrange.write().compaction_to(now)?;
Ok(())
};
err_collector.run(run_compaction);
// schedule the next time this operator should run
if let Some(i) = arrange.read().get_next_update_time(&now) {
scheduler.schedule_at(i)
}
}
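
`mfp_subgraph` follows a read/emit/compact cycle: emit everything the arrangement accumulated since the last compaction, then compact up to `now` and schedule the next run. The sketch below shows the range-read and compaction bookkeeping with a toy map-based arrangement; the real `Arrangement` in `crate::utils` does more (it tracks future updates and compaction times), so treat this only as an illustration:

use std::collections::BTreeMap;
use std::ops::RangeInclusive;

// Toy arrangement: timestamps map to batches of (string) updates.
struct Arrange {
    updates: BTreeMap<i64, Vec<String>>,
    last_compaction: Option<i64>,
}

impl Arrange {
    fn get_updates_in_range(&self, range: RangeInclusive<i64>) -> Vec<String> {
        self.updates
            .range(range)
            .flat_map(|(_, v)| v.iter().cloned())
            .collect()
    }

    fn compact_to(&mut self, now: i64) {
        // keep only updates strictly after `now`
        self.updates = self.updates.split_off(&(now + 1));
        self.last_compaction = Some(now);
    }
}

fn main() {
    let mut arrange = Arrange {
        updates: BTreeMap::from([(1, vec!["a".to_string()]), (3, vec!["b".to_string()])]),
        last_compaction: None,
    };
    let now = 2;
    // read everything since the last compaction, just like `mfp_subgraph`
    let from = arrange.last_compaction.map(|n| n + 1).unwrap_or(i64::MIN);
    assert_eq!(arrange.get_updates_in_range(from..=now), vec!["a".to_string()]);
    arrange.compact_to(now);
    assert!(arrange.get_updates_in_range(i64::MIN..=now).is_empty());
}
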
/// The core of evaluating an MFP operator: given an MFP plan and an input, evaluate it and
/// return the output updates, collecting any errors that occurred during the evaluation
fn eval_mfp_core(
input: impl IntoIterator<Item = DiffRow>,
mfp_plan: &MfpPlan,
now: repr::Timestamp,
err_collector: &ErrCollector,
) -> Vec<KeyValDiffRow> {
let mut all_updates = Vec::new();
for (mut row, _sys_time, diff) in input.into_iter() {
// these updates are expected to be only zero to two rows
let updates = mfp_plan.evaluate::<EvalError>(&mut row.inner, now, diff);
// TODO(discord9): refactor error handling
// An error in a single row should not interrupt the whole evaluation
let updates = updates
.filter_map(|r| match r {
Ok((key, ts, diff)) => Some(((key, Row::empty()), ts, diff)),
Err((err, _ts, _diff)) => {
err_collector.push_err(err);
None
}
})
.collect_vec();
all_updates.extend(updates);
}
all_updates
}
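
`eval_mfp_core` deliberately keeps going when a single row fails: good rows become updates, and bad rows only push an error into the collector. A minimal standalone sketch of that filter-and-collect pattern, using integer division as a stand-in for expression evaluation:

// A sketch: per-item errors are collected on the side instead of aborting the batch.
fn main() {
    let inputs = vec![4i64, 0, 2];
    let mut errors = Vec::new();
    let outputs: Vec<i64> = inputs
        .into_iter()
        .filter_map(|x| match 100i64.checked_div(x) {
            Some(v) => Some(v),
            None => {
                errors.push(format!("division by zero on input {x}"));
                None
            }
        })
        .collect();

    assert_eq!(outputs, vec![25, 50]);
    assert_eq!(errors.len(), 1);
}
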
#[cfg(test)]
mod test {
use std::cell::RefCell;
use std::rc::Rc;
use common_time::DateTime;
use datatypes::data_type::ConcreteDataType;
use hydroflow::scheduled::graph::Hydroflow;
use hydroflow::scheduled::graph_ext::GraphExt;
use hydroflow::scheduled::handoff::VecHandoff;
use super::*;
use crate::expr::BinaryFunc;
use crate::repr::Row;
fn harness_test_ctx<'r, 'h>(
df: &'r mut Hydroflow<'h>,
state: &'r mut DataflowState,
) -> Context<'r, 'h> {
let err_collector = state.get_err_collector();
Context {
id: GlobalId::User(0),
df,
compute_state: state,
input_collection: BTreeMap::new(),
local_scope: Default::default(),
err_collector,
}
}
/// Test whether the temporal filter works properly,
/// namely whether the mfp operator can schedule a delete at the correct time.
#[test]
fn test_render_mfp_with_temporal() {
let mut df = Hydroflow::new();
let mut state = DataflowState::default();
let mut ctx = harness_test_ctx(&mut df, &mut state);
let rows = vec![
(Row::new(vec![1i64.into()]), 1, 1),
(Row::new(vec![2i64.into()]), 2, 1),
(Row::new(vec![3i64.into()]), 3, 1),
];
let collection = ctx.render_constant(rows);
ctx.insert_global(GlobalId::User(1), collection);
let input_plan = Plan::Get {
id: expr::Id::Global(GlobalId::User(1)),
};
// temporal filter: now <= col(0) < now + 4
let mfp = MapFilterProject::new(1)
.filter(vec![
ScalarExpr::Column(0)
.call_unary(expr::UnaryFunc::Cast(ConcreteDataType::datetime_datatype()))
.call_binary(
ScalarExpr::CallUnmaterializable(expr::UnmaterializableFunc::Now),
BinaryFunc::Gte,
),
ScalarExpr::Column(0)
.call_binary(
ScalarExpr::literal(4i64.into(), ConcreteDataType::int64_datatype()),
BinaryFunc::SubInt64,
)
.call_unary(expr::UnaryFunc::Cast(ConcreteDataType::datetime_datatype()))
.call_binary(
ScalarExpr::CallUnmaterializable(expr::UnmaterializableFunc::Now),
BinaryFunc::Lt,
),
])
.unwrap();
let mut bundle = ctx
.render_map_filter_project_into_executable_dataflow(Box::new(input_plan), mfp)
.unwrap();
let collection = bundle.collection;
let _arranged = bundle.arranged.pop_first().unwrap().1;
let output = Rc::new(RefCell::new(vec![]));
let output_inner = output.clone();
let _subgraph = ctx.df.add_subgraph_sink(
"test_render_constant",
collection.into_inner(),
move |_ctx, recv| {
let data = recv.take_inner();
let res = data.into_iter().flat_map(|v| v.into_iter()).collect_vec();
output_inner.borrow_mut().clear();
output_inner.borrow_mut().extend(res);
},
);
// drop ctx here to simulate the actual compile-first, run-later scenario
drop(ctx);
// expected output at given time
let expected_output = BTreeMap::from([
(
0, // time
vec![
(Row::new(vec![1i64.into()]), 0, 1),
(Row::new(vec![2i64.into()]), 0, 1),
(Row::new(vec![3i64.into()]), 0, 1),
],
),
(
2, // time
vec![(Row::new(vec![1i64.into()]), 2, -1)],
),
(
3, // time
vec![(Row::new(vec![2i64.into()]), 3, -1)],
),
(
4, // time
vec![(Row::new(vec![3i64.into()]), 4, -1)],
),
]);
for now in 0i64..5 {
state.set_current_ts(now);
state.run_available_with_schedule(&mut df);
assert!(state.get_err_collector().inner.borrow().is_empty());
if let Some(expected) = expected_output.get(&now) {
assert_eq!(*output.borrow(), *expected);
} else {
assert_eq!(*output.borrow(), vec![]);
};
output.borrow_mut().clear();
}
}
/// Test whether the mfp operator without a temporal filter works properly,
/// that is, whether it filters the rows correctly.
#[test]
fn test_render_mfp() {
let mut df = Hydroflow::new();
let mut state = DataflowState::default();
let mut ctx = harness_test_ctx(&mut df, &mut state);
let rows = vec![
(Row::new(vec![1.into()]), 1, 1),
(Row::new(vec![2.into()]), 2, 1),
(Row::new(vec![3.into()]), 3, 1),
];
let collection = ctx.render_constant(rows);
ctx.insert_global(GlobalId::User(1), collection);
let input_plan = Plan::Get {
id: expr::Id::Global(GlobalId::User(1)),
};
// filter: col(0)>1
let mfp = MapFilterProject::new(1)
.filter(vec![ScalarExpr::Column(0).call_binary(
ScalarExpr::literal(1.into(), ConcreteDataType::int32_datatype()),
BinaryFunc::Gt,
)])
.unwrap();
let bundle = ctx
.render_map_filter_project_into_executable_dataflow(Box::new(input_plan), mfp)
.unwrap();
let collection = bundle.collection.clone(ctx.df);
ctx.df.add_subgraph_sink(
"test_render_constant",
collection.into_inner(),
move |_ctx, recv| {
let data = recv.take_inner();
let res = data.into_iter().flat_map(|v| v.into_iter()).collect_vec();
assert_eq!(
res,
vec![
(Row::new(vec![2.into()]), 0, 1),
(Row::new(vec![3.into()]), 0, 1),
]
)
},
);
drop(ctx);
df.run_available();
}
/// Test whether the constant operator works properly,
/// that is, whether it emits only once, not multiple times.
#[test]
fn test_render_constant() {
let mut df = Hydroflow::new();
let mut state = DataflowState::default();
let mut ctx = harness_test_ctx(&mut df, &mut state);
let rows = vec![
(Row::empty(), 1, 1),
(Row::empty(), 2, 1),
(Row::empty(), 3, 1),
];
let collection = ctx.render_constant(rows);
let collection = collection.collection.clone(ctx.df);
let cnt = Rc::new(RefCell::new(0));
let cnt_inner = cnt.clone();
ctx.df.add_subgraph_sink(
"test_render_constant",
collection.into_inner(),
move |_ctx, recv| {
let data = recv.take_inner();
*cnt_inner.borrow_mut() += data.iter().map(|v| v.len()).sum::<usize>();
},
);
ctx.df.run_available();
assert_eq!(*cnt.borrow(), 3);
ctx.df.run_available();
assert_eq!(*cnt.borrow(), 3);
}
/// A simple example showing how to use a source and a sink.
#[test]
fn example_source_sink() {
let mut df = Hydroflow::new();
let (send_port, recv_port) = df.make_edge::<_, VecHandoff<i32>>("test_handoff");
df.add_subgraph_source("test_handoff_source", send_port, move |_ctx, send| {
for i in 0..10 {
send.give(vec![i]);
}
});
let sum = Rc::new(RefCell::new(0));
let sum_move = sum.clone();
let sink = df.add_subgraph_sink("test_handoff_sink", recv_port, move |_ctx, recv| {
let data = recv.take_inner();
*sum_move.borrow_mut() += data.iter().sum::<i32>();
});
df.run_available();
assert_eq!(sum.borrow().to_owned(), 45);
df.schedule_subgraph(sink);
df.run_available();
assert_eq!(sum.borrow().to_owned(), 45);
}
}
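
The tests above capture sink output through a shared `Rc<RefCell<Vec<_>>>`: the sink closure owns one clone, and the assertions read the other handle after the dataflow has run. A minimal standalone sketch of that capture pattern, without hydroflow:

use std::cell::RefCell;
use std::rc::Rc;

fn main() {
    let output: Rc<RefCell<Vec<i32>>> = Rc::new(RefCell::new(Vec::new()));
    let output_inner = output.clone();
    // the "sink": appends every batch it receives to the shared buffer
    let sink = move |batch: Vec<i32>| {
        output_inner.borrow_mut().extend(batch);
    };

    sink(vec![1, 2]);
    sink(vec![3]);
    assert_eq!(*output.borrow(), vec![1, 2, 3]);
}
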


@@ -0,0 +1,106 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cell::RefCell;
use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::rc::Rc;
use hydroflow::scheduled::graph::Hydroflow;
use hydroflow::scheduled::SubgraphId;
use crate::compute::types::ErrCollector;
use crate::repr::{self, Timestamp};
/// Input/output of a dataflow.
/// One `DataflowState` manages the input/output/schedule of one `Hydroflow`.
#[derive(Default)]
pub struct DataflowState {
/// it is important to use a deque here to maintain the order of subgraphs
/// TODO(discord9): consider dedup? It is not strictly necessary, since hydroflow itself also dedups when scheduling
schedule_subgraph: Rc<RefCell<BTreeMap<Timestamp, VecDeque<SubgraphId>>>>,
/// Frontier (in sys time) before which updates should not be emitted.
///
/// We *must* apply it to sinks, to ensure correct outputs.
/// We *should* apply it to sources and imported shared state, because it improves performance.
/// It also serves as the current time for temporal filters, so they produce results that are correct as of now.
as_of: Rc<RefCell<Timestamp>>,
/// Error collector local to this `DataflowState`,
/// useful for distinguishing errors coming from different `Hydroflow` instances
err_collector: ErrCollector,
}
impl DataflowState {
/// Schedule all subgraphs that need to run at time <= `as_of`, then call `run_available()`.
///
/// Returns true if any subgraph actually executed.
pub fn run_available_with_schedule(&mut self, df: &mut Hydroflow) -> bool {
// first split keys <= as_of into another map
let mut before = self
.schedule_subgraph
.borrow_mut()
.split_off(&(*self.as_of.borrow() + 1));
std::mem::swap(&mut before, &mut self.schedule_subgraph.borrow_mut());
for (_, v) in before {
for subgraph in v {
df.schedule_subgraph(subgraph);
}
}
df.run_available()
}
pub fn get_scheduler(&self) -> Scheduler {
Scheduler {
schedule_subgraph: self.schedule_subgraph.clone(),
cur_subgraph: Rc::new(RefCell::new(None)),
}
}
/// Return a handle to the current time that updates whenever `as_of` is updated,
///
/// so the current time can be tracked even inside a closure that is called later.
pub fn current_time_ref(&self) -> Rc<RefCell<Timestamp>> {
self.as_of.clone()
}
pub fn current_ts(&self) -> Timestamp {
*self.as_of.borrow()
}
pub fn set_current_ts(&mut self, ts: Timestamp) {
self.as_of.replace(ts);
}
pub fn get_err_collector(&self) -> ErrCollector {
self.err_collector.clone()
}
}
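
`run_available_with_schedule` uses `BTreeMap::split_off` plus `std::mem::swap` to pop every entry scheduled at or before `as_of` in one pass. A minimal standalone sketch of just that trick:

use std::collections::BTreeMap;

fn main() {
    let mut schedule: BTreeMap<i64, Vec<&str>> =
        BTreeMap::from([(1, vec!["a"]), (2, vec!["b"]), (5, vec!["c"])]);
    let as_of = 2;

    // `split_off` returns the entries with key >= as_of + 1 (not yet due),
    // leaving keys <= as_of behind in `schedule`...
    let mut due = schedule.split_off(&(as_of + 1));
    // ...so after the swap, `due` holds keys <= as_of and `schedule` keeps the rest.
    std::mem::swap(&mut due, &mut schedule);

    assert_eq!(due.keys().copied().collect::<Vec<_>>(), vec![1, 2]);
    assert_eq!(schedule.keys().copied().collect::<Vec<_>>(), vec![5]);
}
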
#[derive(Clone)]
pub struct Scheduler {
schedule_subgraph: Rc<RefCell<BTreeMap<Timestamp, VecDeque<SubgraphId>>>>,
cur_subgraph: Rc<RefCell<Option<SubgraphId>>>,
}
impl Scheduler {
pub fn schedule_at(&self, next_run_time: Timestamp) {
let mut schedule_subgraph = self.schedule_subgraph.borrow_mut();
let subgraph = self.cur_subgraph.borrow();
let subgraph = subgraph.as_ref().expect("Set SubgraphId before schedule");
let subgraph_queue = schedule_subgraph.entry(next_run_time).or_default();
subgraph_queue.push_back(*subgraph);
}
pub fn set_cur_subgraph(&self, subgraph: SubgraphId) {
self.cur_subgraph.replace(Some(subgraph));
}
}
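
`current_time_ref` hands out an `Rc<RefCell<Timestamp>>` so a closure built at dataflow-construction time can always read the latest tick when it later runs, which is how the mfp closure in render.rs picks up `*now.borrow()` on every invocation. A minimal standalone sketch with plain `i64` timestamps:

use std::cell::RefCell;
use std::rc::Rc;

fn main() {
    let now: Rc<RefCell<i64>> = Rc::new(RefCell::new(0));
    let handle = now.clone();
    // built once, reads the *latest* value each time it is called
    let read_now = move || *handle.borrow();

    assert_eq!(read_now(), 0);
    *now.borrow_mut() = 42; // e.g. `set_current_ts` advancing the tick
    assert_eq!(read_now(), 42);
}
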


@@ -0,0 +1,162 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cell::RefCell;
use std::collections::{BTreeMap, VecDeque};
use std::rc::Rc;
use std::sync::Arc;
use hydroflow::scheduled::graph::Hydroflow;
use hydroflow::scheduled::handoff::TeeingHandoff;
use hydroflow::scheduled::port::RecvPort;
use hydroflow::scheduled::SubgraphId;
use tokio::sync::RwLock;
use crate::compute::render::Context;
use crate::expr::{EvalError, ScalarExpr};
use crate::repr::DiffRow;
use crate::utils::{ArrangeHandler, Arrangement};
pub type Toff = TeeingHandoff<DiffRow>;
/// A collection represents data received from a handoff.
pub struct Collection<T: 'static> {
/// represents a stream of updates received from this port
stream: RecvPort<TeeingHandoff<T>>,
}
impl<T: 'static + Clone> Collection<T> {
pub fn from_port(port: RecvPort<TeeingHandoff<T>>) -> Self {
Collection { stream: port }
}
/// Clone a collection; requires a mutable reference to the hydroflow instance.
///
/// Note: it must be the same hydroflow instance that this collection was created from.
pub fn clone(&self, df: &mut Hydroflow) -> Self {
Collection {
stream: self.stream.tee(df),
}
}
pub fn into_inner(self) -> RecvPort<TeeingHandoff<T>> {
self.stream
}
}
/// Arranged is a wrapper around `ArrangeHandler` that maintains a list of readers and a writer
pub struct Arranged {
pub arrangement: ArrangeHandler,
pub writer: Rc<RefCell<Option<SubgraphId>>>,
/// maintains a list of readers for the arrangement, for ease of scheduling
pub readers: Rc<RefCell<Vec<SubgraphId>>>,
}
impl Arranged {
pub fn new(arr: ArrangeHandler) -> Self {
Self {
arrangement: arr,
writer: Default::default(),
readers: Default::default(),
}
}
/// Copy its future-only updates; internally `Rc`-ed, so it is cheap to copy
pub fn try_copy_future(&self) -> Option<Self> {
self.arrangement
.clone_future_only()
.map(|arrangement| Arranged {
arrangement,
readers: self.readers.clone(),
writer: self.writer.clone(),
})
}
/// Copy the full arrangement, including the future and the current updates.
///
/// Internally `Rc`-ed, so it is cheap to copy
pub fn try_copy_full(&self) -> Option<Self> {
self.arrangement
.clone_full_arrange()
.map(|arrangement| Arranged {
arrangement,
readers: self.readers.clone(),
writer: self.writer.clone(),
})
}
pub fn add_reader(&self, id: SubgraphId) {
self.readers.borrow_mut().push(id)
}
}
/// A bundle of the various ways a collection can be represented.
///
/// This type maintains the invariant that it contains at least one (possibly both) valid
/// source of data: either a collection or at least one arrangement. This makes it
/// convenient to read the data from the collection.
pub struct CollectionBundle {
/// This is useful for passively reading the new updates from the collection
pub collection: Collection<DiffRow>,
/// the key (a `Vec` of [`ScalarExpr`]) indicates how the keys (also [`Row`]s) used in `Arranged` are extracted from the collection's [`Row`],
/// so it is the "index" of the arrangement
///
/// The `Arranged` is the actual data source; it can be used to read the data from the collection
/// by using the key indicated by the `Vec<ScalarExpr>`
pub arranged: BTreeMap<Vec<ScalarExpr>, Arranged>,
}
impl CollectionBundle {
pub fn from_collection(collection: Collection<DiffRow>) -> Self {
Self {
collection,
arranged: BTreeMap::default(),
}
}
pub fn clone(&self, df: &mut Hydroflow) -> Self {
Self {
collection: self.collection.clone(df),
arranged: self
.arranged
.iter()
.map(|(k, v)| (k.clone(), v.try_copy_future().unwrap()))
.collect(),
}
}
}
/// A thread-local error collector, used to collect errors during the evaluation of the plan.
///
/// Usually only the first error matters, but all of them are stored just in case.
///
/// A `VecDeque` is used to preserve the order of errors,
/// which matters when the dataflow runs continuously and errors must be reported in order.
#[derive(Default, Clone)]
pub struct ErrCollector {
pub inner: Rc<RefCell<VecDeque<EvalError>>>,
}
impl ErrCollector {
pub fn push_err(&self, err: EvalError) {
self.inner.borrow_mut().push_back(err)
}
pub fn run<F>(&self, f: F)
where
F: FnOnce() -> Result<(), EvalError>,
{
if let Err(e) = f() {
self.push_err(e)
}
}
}


@@ -20,10 +20,11 @@ mod id;
mod linear;
mod relation;
mod scalar;
mod signature;
pub(crate) use error::{EvalError, InvalidArgumentSnafu, OptimizeSnafu};
pub(crate) use func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
pub(crate) use id::{GlobalId, Id, LocalId};
pub(crate) use linear::{MapFilterProject, MfpPlan, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc};
pub(crate) use scalar::{ScalarExpr, TypedExpr};


@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Error handling for expression evaluation.
use std::any::Any;
use common_macro::stack_trace_debug;
@@ -59,9 +61,6 @@ pub enum EvalError {
#[snafu(display("Optimize error: {reason}"))]
Optimize { reason: String, location: Location },
#[snafu(display("Unsupported temporal filter: {reason}"))]
UnsupportedTemporalFilter { reason: String, location: Location },
#[snafu(display("Overflowed during evaluation"))]
Overflow { location: Location },
}


@@ -12,21 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! This module contains the definition of functions that can be used in expressions.
use std::collections::HashMap;
use std::sync::OnceLock;
use common_time::DateTime;
use datafusion_expr::Operator;
use datafusion_substrait::logical_plan::consumer::name_to_op;
use datatypes::data_type::ConcreteDataType;
use datatypes::types::cast;
use datatypes::types::cast::CastOption;
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use smallvec::smallvec;
use snafu::{ensure, OptionExt, ResultExt};
use strum::{EnumIter, IntoEnumIterator};
use crate::adapter::error::{Error, InvalidQuerySnafu, PlanSnafu};
use crate::expr::error::{
CastValueSnafu, DivisionByZeroSnafu, EvalError, InternalSnafu, TryFromValueSnafu,
TypeMismatchSnafu,
};
use crate::expr::signature::{GenericFn, Signature};
use crate::expr::{InvalidArgumentSnafu, ScalarExpr};
use crate::repr::{value_to_internal_ts, Row};
/// UnmaterializableFunc is a function that can't be evaluated independently
/// and requires special handling
@@ -36,6 +46,38 @@ pub enum UnmaterializableFunc {
CurrentSchema,
}
impl UnmaterializableFunc {
/// Return the signature of the function
pub fn signature(&self) -> Signature {
match self {
Self::Now => Signature {
input: smallvec![],
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::Now,
},
Self::CurrentSchema => Signature {
input: smallvec![],
output: ConcreteDataType::string_datatype(),
generic_fn: GenericFn::CurrentSchema,
},
}
}
/// Create an UnmaterializableFunc from the function name string
pub fn from_str(name: &str) -> Result<Self, Error> {
match name {
"now" => Ok(Self::Now),
"current_schema" => Ok(Self::CurrentSchema),
_ => InvalidQuerySnafu {
reason: format!("Unknown unmaterializable function: {}", name),
}
.fail(),
}
}
}
/// UnaryFunc is a function that takes one argument. Note that this enum doesn't contain function arguments,
/// because the arguments are stored in the expression (except for the `cast` function, which requires a type argument)
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum UnaryFunc {
Not,
@@ -47,6 +89,68 @@ pub enum UnaryFunc {
}
impl UnaryFunc {
/// Return the signature of the function
pub fn signature(&self) -> Signature {
match self {
Self::IsNull => Signature {
input: smallvec![ConcreteDataType::null_datatype()],
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::IsNull,
},
Self::Not | Self::IsTrue | Self::IsFalse => Signature {
input: smallvec![ConcreteDataType::boolean_datatype()],
output: ConcreteDataType::boolean_datatype(),
generic_fn: match self {
Self::Not => GenericFn::Not,
Self::IsTrue => GenericFn::IsTrue,
Self::IsFalse => GenericFn::IsFalse,
_ => unreachable!(),
},
},
Self::StepTimestamp => Signature {
input: smallvec![ConcreteDataType::datetime_datatype()],
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::StepTimestamp,
},
Self::Cast(to) => Signature {
input: smallvec![ConcreteDataType::null_datatype()],
output: to.clone(),
generic_fn: GenericFn::Cast,
},
}
}
/// Create a UnaryFunc from the function name string and an optional argument type
pub fn from_str_and_type(
name: &str,
arg_type: Option<ConcreteDataType>,
) -> Result<Self, Error> {
match name {
"not" => Ok(Self::Not),
"is_null" => Ok(Self::IsNull),
"is_true" => Ok(Self::IsTrue),
"is_false" => Ok(Self::IsFalse),
"step_timestamp" => Ok(Self::StepTimestamp),
"cast" => {
let arg_type = arg_type.with_context(|| InvalidQuerySnafu {
reason: "cast function requires a type argument".to_string(),
})?;
Ok(UnaryFunc::Cast(arg_type))
}
_ => InvalidQuerySnafu {
reason: format!("Unknown unary function: {}", name),
}
.fail(),
}
}
/// Evaluate the function with given values and expression
///
/// # Arguments
///
/// - `values`: The values to be used in the evaluation
///
/// - `expr`: The expression to be evaluated and used as the argument; its value is extracted from `values` when it is evaluated
pub fn eval(&self, values: &[Value], expr: &ScalarExpr) -> Result<Value, EvalError> {
let arg = expr.eval(values)?;
match self {
@@ -80,13 +184,17 @@ impl UnaryFunc {
}
}
Self::StepTimestamp => {
let ty = arg.data_type();
if let Value::DateTime(datetime) = arg {
let datetime = DateTime::from(datetime.val() + 1);
Ok(Value::from(datetime))
} else if let Ok(v) = value_to_internal_ts(arg) {
let datetime = DateTime::from(v + 1);
Ok(Value::from(datetime))
} else {
TypeMismatchSnafu {
expected: ConcreteDataType::datetime_datatype(),
actual: ty,
}
.fail()?
}
@@ -105,8 +213,13 @@ impl UnaryFunc {
}
}
/// BinaryFunc is a function that takes two arguments.
/// Also notice this enum doesn't contain function arguments, since the arguments are stored in the expression.
///
/// TODO(discord9): support more binary functions for more types
#[derive(
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash, EnumIter,
)]
pub enum BinaryFunc {
Eq,
NotEq,
@@ -154,7 +267,257 @@ pub enum BinaryFunc {
ModUInt64,
}
/// Generate binary function signature based on the function and the input types
/// The user can provide custom signatures for some functions in the form of a regular match arm,
/// and the rest will be generated according to the provided list of functions like this:
/// ```ignore
/// AddInt16=>(int16_datatype,Add),
/// ```
/// which expand to:
/// ```ignore
/// Self::AddInt16 => Signature {
/// input: smallvec![
/// ConcreteDataType::int16_datatype(),
/// ConcreteDataType::int16_datatype(),
/// ],
/// output: ConcreteDataType::int16_datatype(),
/// generic_fn: GenericFn::Add,
/// },
/// ```
macro_rules! generate_binary_signature {
($value:ident, { $($user_arm:tt)* },
[ $(
$auto_arm:ident=>($con_type:ident,$generic:ident)
),*
]) => {
match $value {
$($user_arm)*,
$(
Self::$auto_arm => Signature {
input: smallvec![
ConcreteDataType::$con_type(),
ConcreteDataType::$con_type(),
],
output: ConcreteDataType::$con_type(),
generic_fn: GenericFn::$generic,
},
)*
}
};
}
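
`generate_binary_signature!` above mixes hand-written match arms with arms expanded from a bracketed list, which keeps the long run of arithmetic variants down to one line each. A minimal standalone sketch of that macro shape, with illustrative names (`Op`, `describe_op!`) that are not part of this PR:

enum Op {
    Special,
    AddI32,
    AddI64,
}

// Hand-written arms come first (the brace group), then the bracketed list expands
// into one arm per entry, mirroring how the signature macro is structured.
macro_rules! describe_op {
    ($value:ident, { $($user_arm:tt)* }, [ $( $auto_arm:ident => $desc:expr ),* ]) => {
        match $value {
            $($user_arm)*,
            $( Op::$auto_arm => $desc, )*
        }
    };
}

fn describe(op: &Op) -> &'static str {
    describe_op!(op, { Op::Special => "hand-written arm" }, [
        AddI32 => "add two i32 values",
        AddI64 => "add two i64 values"
    ])
}

fn main() {
    assert_eq!(describe(&Op::Special), "hand-written arm");
    assert_eq!(describe(&Op::AddI32), "add two i32 values");
    assert_eq!(describe(&Op::AddI64), "add two i64 values");
}
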
static SPECIALIZATION: OnceLock<HashMap<(GenericFn, ConcreteDataType), BinaryFunc>> =
OnceLock::new();
impl BinaryFunc {
/// Use the null type to refer to any type
pub fn signature(&self) -> Signature {
generate_binary_signature!(self, {
Self::Eq | Self::NotEq | Self::Lt | Self::Lte | Self::Gt | Self::Gte => Signature {
input: smallvec![
ConcreteDataType::null_datatype(),
ConcreteDataType::null_datatype()
],
output: ConcreteDataType::boolean_datatype(),
generic_fn: match self {
Self::Eq => GenericFn::Eq,
Self::NotEq => GenericFn::NotEq,
Self::Lt => GenericFn::Lt,
Self::Lte => GenericFn::Lte,
Self::Gt => GenericFn::Gt,
Self::Gte => GenericFn::Gte,
_ => unreachable!(),
},
}
},
[
AddInt16=>(int16_datatype,Add),
AddInt32=>(int32_datatype,Add),
AddInt64=>(int64_datatype,Add),
AddUInt16=>(uint16_datatype,Add),
AddUInt32=>(uint32_datatype,Add),
AddUInt64=>(uint64_datatype,Add),
AddFloat32=>(float32_datatype,Add),
AddFloat64=>(float64_datatype,Add),
SubInt16=>(int16_datatype,Sub),
SubInt32=>(int32_datatype,Sub),
SubInt64=>(int64_datatype,Sub),
SubUInt16=>(uint16_datatype,Sub),
SubUInt32=>(uint32_datatype,Sub),
SubUInt64=>(uint64_datatype,Sub),
SubFloat32=>(float32_datatype,Sub),
SubFloat64=>(float64_datatype,Sub),
MulInt16=>(int16_datatype,Mul),
MulInt32=>(int32_datatype,Mul),
MulInt64=>(int64_datatype,Mul),
MulUInt16=>(uint16_datatype,Mul),
MulUInt32=>(uint32_datatype,Mul),
MulUInt64=>(uint64_datatype,Mul),
MulFloat32=>(float32_datatype,Mul),
MulFloat64=>(float64_datatype,Mul),
DivInt16=>(int16_datatype,Div),
DivInt32=>(int32_datatype,Div),
DivInt64=>(int64_datatype,Div),
DivUInt16=>(uint16_datatype,Div),
DivUInt32=>(uint32_datatype,Div),
DivUInt64=>(uint64_datatype,Div),
DivFloat32=>(float32_datatype,Div),
DivFloat64=>(float64_datatype,Div),
ModInt16=>(int16_datatype,Mod),
ModInt32=>(int32_datatype,Mod),
ModInt64=>(int64_datatype,Mod),
ModUInt16=>(uint16_datatype,Mod),
ModUInt32=>(uint32_datatype,Mod),
ModUInt64=>(uint64_datatype,Mod)
]
)
}
/// Get the specialization of the binary function based on the generic function and the input type
pub fn specialization(generic: GenericFn, input_type: ConcreteDataType) -> Result<Self, Error> {
let rule = SPECIALIZATION.get_or_init(|| {
let mut spec = HashMap::new();
for func in BinaryFunc::iter() {
let sig = func.signature();
spec.insert((sig.generic_fn, sig.input[0].clone()), func);
}
spec
});
rule.get(&(generic, input_type.clone()))
.cloned()
.with_context(|| InvalidQuerySnafu {
reason: format!(
"No specialization found for binary function {:?} with input type {:?}",
generic, input_type
),
})
}
/// Tries its best to infer the type from the input types and expressions.
///
/// Returns an error if the type cannot be determined.
pub(crate) fn infer_type_from(
generic: GenericFn,
arg_exprs: &[ScalarExpr],
arg_types: &[Option<ConcreteDataType>],
) -> Result<ConcreteDataType, Error> {
let ret = match (arg_types[0].as_ref(), arg_types[1].as_ref()) {
(Some(t1), Some(t2)) => {
ensure!(
t1 == t2,
InvalidQuerySnafu {
reason: format!(
"Binary function {:?} requires both arguments to have the same type",
generic
),
}
);
t1.clone()
}
(Some(t), None) | (None, Some(t)) => t.clone(),
_ => arg_exprs[0]
.as_literal()
.map(|lit| lit.data_type())
.or_else(|| arg_exprs[1].as_literal().map(|lit| lit.data_type()))
.with_context(|| InvalidQuerySnafu {
reason: format!(
"Binary function {:?} requires at least one argument with known type",
generic
),
})?,
};
Ok(ret)
}
/// Choose the appropriate specialization based on the input types,
/// returning a specialization of the binary function and its actual input and output types (so no null type is present).
///
/// It tries its best to determine the input types from `arg_types` and `arg_exprs`:
/// if `arg_types` is not enough, it falls back to `arg_exprs` when an expression is a literal with a known type.
pub fn from_str_expr_and_type(
name: &str,
arg_exprs: &[ScalarExpr],
arg_types: &[Option<ConcreteDataType>],
) -> Result<(Self, Signature), Error> {
// `name_to_op` on error simply returns a message like `unsupported function xxx`, so rewrap it into a more specific error
let op = name_to_op(name).or_else(|err| {
if let datafusion_common::DataFusionError::NotImplemented(msg) = err {
InvalidQuerySnafu {
reason: format!("Unsupported binary function: {}", msg),
}
.fail()
} else {
InvalidQuerySnafu {
reason: format!("Error when parsing binary function: {:?}", err),
}
.fail()
}
})?;
// get the first arg type and make sure that, if both are Some, they are the same
let generic_fn = {
match op {
Operator::Eq => GenericFn::Eq,
Operator::NotEq => GenericFn::NotEq,
Operator::Lt => GenericFn::Lt,
Operator::LtEq => GenericFn::Lte,
Operator::Gt => GenericFn::Gt,
Operator::GtEq => GenericFn::Gte,
Operator::Plus => GenericFn::Add,
Operator::Minus => GenericFn::Sub,
Operator::Multiply => GenericFn::Mul,
Operator::Divide => GenericFn::Div,
Operator::Modulo => GenericFn::Mod,
_ => {
return InvalidQuerySnafu {
reason: format!("Unsupported binary function: {}", name),
}
.fail();
}
}
};
let need_type = matches!(
generic_fn,
GenericFn::Add | GenericFn::Sub | GenericFn::Mul | GenericFn::Div | GenericFn::Mod
);
ensure!(
arg_exprs.len() == 2 && arg_types.len() == 2,
PlanSnafu {
reason: "Binary function requires exactly 2 arguments".to_string()
}
);
let arg_type = Self::infer_type_from(generic_fn, arg_exprs, arg_types)?;
// if the type is not needed, erase the input type to null so we can look up the
// correct specialization for functions that do not depend on the type
let query_input_type = if need_type {
arg_type.clone()
} else {
ConcreteDataType::null_datatype()
};
let spec_fn = Self::specialization(generic_fn, query_input_type)?;
let signature = Signature {
input: smallvec![arg_type.clone(), arg_type],
output: spec_fn.signature().output,
generic_fn,
};
Ok((spec_fn, signature))
}
/// Evaluate the function with given values and expression
///
/// # Arguments
///
/// - `values`: The values to be used in the evaluation
///
/// - `expr1`: The first argument to be evaluated; its value is extracted from `values` when it is evaluated
///
/// - `expr2`: The second arg to be evaluated
pub fn eval(
&self,
values: &[Value],
@@ -218,7 +581,7 @@ impl BinaryFunc {
/// Reverse the comparison operator, i.e. `a < b` becomes `b > a`,
/// equal and not equal are unchanged.
pub fn reverse_compare(&self) -> Result<Self, Error> {
let ret = match &self {
BinaryFunc::Eq => BinaryFunc::Eq,
BinaryFunc::NotEq => BinaryFunc::NotEq,
@@ -227,7 +590,7 @@ impl BinaryFunc {
BinaryFunc::Gt => BinaryFunc::Lt,
BinaryFunc::Gte => BinaryFunc::Lte,
_ => {
return InvalidQuerySnafu {
reason: format!("Expect a comparison operator, found {:?}", self),
}
.fail();
@@ -237,13 +600,44 @@ impl BinaryFunc {
}
}
/// VariadicFunc is a function that takes a variable number of arguments.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum VariadicFunc {
And,
Or,
}
impl VariadicFunc {
/// Return the signature of the function
pub fn signature(&self) -> Signature {
Signature {
input: smallvec![ConcreteDataType::boolean_datatype()],
output: ConcreteDataType::boolean_datatype(),
generic_fn: match self {
Self::And => GenericFn::And,
Self::Or => GenericFn::Or,
},
}
}
/// Create a VariadicFunc from the function name string and optional argument types
pub fn from_str_and_types(
name: &str,
arg_types: &[Option<ConcreteDataType>],
) -> Result<Self, Error> {
// TODO: future variadic funcs to be added might need to check arg_types
let _ = arg_types;
match name {
"and" => Ok(Self::And),
"or" => Ok(Self::Or),
_ => InvalidQuerySnafu {
reason: format!("Unknown variadic function: {}", name),
}
.fail(),
}
}
/// Evaluate the function with given values and expressions
pub fn eval(&self, values: &[Value], exprs: &[ScalarExpr]) -> Result<Value, EvalError> {
match self {
VariadicFunc::And => and(values, exprs),
@@ -373,7 +767,7 @@ fn test_num_ops() {
assert_eq!(res, Value::from(30));
let res = div::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(3));
let res = rem::<i32>(left, right).unwrap();
assert_eq!(res, Value::from(1));
let values = vec![Value::from(true), Value::from(false)];
@@ -383,3 +777,97 @@ fn test_num_ops() {
let res = or(&values, &exprs).unwrap();
assert_eq!(res, Value::from(true));
}
/// Test whether binary function specialization works,
/// whether the type comes directly from the given types or from a literal expression.
#[test]
fn test_binary_func_spec() {
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[
Some(ConcreteDataType::int32_datatype()),
Some(ConcreteDataType::int32_datatype())
]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[
ScalarExpr::Literal(Value::from(1i32), ConcreteDataType::int32_datatype()),
ScalarExpr::Column(0)
],
&[None, None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
// this test case makes sure the specialization can find the actual type from the expression and fill in the signature
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"equal",
&[
ScalarExpr::Literal(Value::from(1i32), ConcreteDataType::int32_datatype()),
ScalarExpr::Column(0)
],
&[None, None]
)
.unwrap(),
(
BinaryFunc::Eq,
Signature {
input: smallvec![
ConcreteDataType::int32_datatype(),
ConcreteDataType::int32_datatype()
],
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Eq
}
)
);
assert!(matches!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[None, None]
),
Err(Error::InvalidQuery { .. })
));
}
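
`BinaryFunc::specialization` above builds its lookup table lazily in a `OnceLock` and then reuses it for every call. A minimal standalone sketch of that pattern, with illustrative string keys standing in for `(GenericFn, ConcreteDataType)`:

use std::collections::HashMap;
use std::sync::OnceLock;

// Build the table once on first use, then share it for every later lookup.
static TABLE: OnceLock<HashMap<(&'static str, &'static str), &'static str>> = OnceLock::new();

fn specialization(generic: &'static str, ty: &'static str) -> Option<&'static str> {
    let table = TABLE.get_or_init(|| {
        HashMap::from([
            (("add", "int32"), "AddInt32"),
            (("add", "int64"), "AddInt64"),
            (("sub", "int32"), "SubInt32"),
        ])
    });
    table.get(&(generic, ty)).copied()
}

fn main() {
    assert_eq!(specialization("add", "int32"), Some("AddInt32"));
    assert_eq!(specialization("mul", "int32"), None);
}
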


@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! `Id` is used to identify a dataflow component in a plan, like `Plan::Get { id: Id }`; this could be a source of data for an arrangement.
use serde::{Deserialize, Serialize};
/// A global id's scope is the current worker, and it is shared across dataflows

Some files were not shown because too many files have changed in this diff.