Compare commits

...

49 Commits

Author SHA1 Message Date
Ruihang Xia
038bc4fe6e revert toml format
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-14 00:46:00 +08:00
Ruihang Xia
6d07c422d8 Merge branch 'main' into fix-proto-clear
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-14 00:36:28 +08:00
Ruihang Xia
6c14ece23f accomplish test assertion
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-14 00:32:49 +08:00
Ruihang Xia
89c51d9b87 reset Sample
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-13 23:32:22 +08:00
Weny Xu
e4333969b4 feat(fuzz): add alter table target (#3503)
* feat(fuzz): validate semantic type of column

* feat(fuzz): add fuzz_alter_table target

* feat(fuzz): validate columns

* chore(ci): add fuzz_alter_table ci cfg
2024-03-13 14:11:47 +00:00
Zhenchi
b55905cf66 feat(fuzz): add insert target (#3499)
* fix(common-time): allow building nanos timestamp from parts split from i64::MIN

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat(fuzz): add insert target

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chore: cleanup cargo.toml and polish comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-03-13 10:03:03 +00:00
WU Jingdi
fb4da05f25 fix: adjust fill behavior of range query (#3489) 2024-03-13 09:20:34 +00:00
Zhenchi
904484b525 fix(common-time): allow building nanos timestamp from parts split from i64::MIN (#3493)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-03-13 02:46:00 +00:00
tison
cafb4708ce refactor: validate constraints eagerly (#3472)
* chore: validate constraints eagerly

Signed-off-by: tison <wander4096@gmail.com>

* use timestamp column

Signed-off-by: tison <wander4096@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

* lint

Signed-off-by: tison <wander4096@gmail.com>

* compile

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-12 13:09:34 +00:00
Yingwen
7c895e2605 perf: more benchmarks for memtables (#3491)
* chore: remove duplicate bench

* refactor: rename bench

* perf: add full scan bench for memtable

* feat: filter bench and add time series to bench group

* chore: comment

* refactor: rename

* style: fix clippy
2024-03-12 12:02:58 +00:00
Lei, HUANG
9afe327bca feat: improve prom write requests decode performance (#3478)
* feat: optimize decode performance

* fix: some cr comments
2024-03-12 12:00:38 +00:00
discord9
58bd065c6b feat(flow): plan def (#3490)
* feat: plan def

* chore: add license

* docs: remove TODO done

* chore: add derive Ord
2024-03-12 10:59:07 +00:00
Yingwen
9aa8f756ab fix: allow passing extra table options (#3484)
* fix: do not check options in parser

* test: fix tests

* test: fix sqlness

* test: add sqlness test

* chore: log options

* chore: must specify compaction type

* feat: validate option key

* feat: add option key validation back
2024-03-12 07:03:52 +00:00
discord9
7639c227ca feat(flow): accumlator for aggr func (#3396)
* feat: Accumlator trait

* feat: add `OrdValue` accum&use enum_dispatch

* test: more accum test

* feat: eval aggr funcs

* chore: refactor test&fmt clippy

* refactor: less verbose

* test: more tests

* refactor: better err handling&use OrdValue for Count

* refactor: ignore null&more tests for error handle

* refactor: OrdValue accum

* chore: extract null check

* refactor: def&use fn signature

* chore: use extra cond with match guard

* chore: per review
2024-03-12 02:09:27 +00:00
tison
1255c1fc9e feat: to_timezone function (#3470)
* feat: to_timezone function

Signed-off-by: tison <wander4096@gmail.com>

* impl Function for ToTimezoneFunction

Signed-off-by: tison <wander4096@gmail.com>

* add test

Signed-off-by: tison <wander4096@gmail.com>

* Add original authors

Co-authored-by: parkma99 <park-ma@hotmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

* address comments

Signed-off-by: tison <wander4096@gmail.com>

* add issue link

Signed-off-by: tison <wander4096@gmail.com>

* code refactor

Signed-off-by: tison <wander4096@gmail.com>

* further tidy

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: parkma99 <park-ma@hotmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2024-03-12 01:46:19 +00:00
Yingwen
06dcd0f6ed fix: freeze data buffer in shard (#3468)
* feat: call freeze if the active data buffer in a shard is full

* chore: more metrics

* chore: print metrics

* chore: enlarge freeze threshold

* test: test freeze

* test: fix config test
2024-03-11 14:51:06 +00:00
Weny Xu
0a4444a43a feat(fuzz): validate columns (#3485) 2024-03-11 11:34:50 +00:00
Ruihang Xia
b7ac8d6aa8 ci: use another mirror for etcd image (#3486)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 10:40:19 +00:00
Weny Xu
e767f37241 fix: fix f64 has no sufficient precision during parsing (#3483) 2024-03-11 09:28:40 +00:00
JeremyHi
da098f5568 fix: make max-txn-ops limit valid (#3481) 2024-03-11 09:27:51 +00:00
shuiyisong
aa953dcc34 fix: impl RecordBatchStream method explicitly (#3482)
fix: impl RecordBatchStream method explicitly
2024-03-11 09:07:10 +00:00
crwen
aa125a50f9 refactor: make http api returns non-200 status code (#3473)
* refactor: make http api returns non-200 status code

* recover some code
2024-03-11 03:38:36 +00:00
Ruihang Xia
d8939eb891 feat: clamp function (#3465)
* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add unit tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* a little type exercise

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 03:26:10 +00:00
shuiyisong
0bb949787c refactor: introduce new Output with OutputMeta (#3466)
* refactor: introduce new output struct

* chore: add helper function

* chore: update comment

* chore: update commit

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>

* chore: rename according to cr

---------

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 02:24:09 +00:00
WU Jingdi
8c37c3fc0f feat: support first_value/last_value in range query (#3448)
* feat: support `first_value/last_value` in range query

* chore: add sqlness test on `count`

* chore: add test
2024-03-11 01:30:39 +00:00
gcmutator
21ff3620be chore: remove repetitive words (#3469)
remove repetitive words

Signed-off-by: gcmutator <329964069@qq.com>
2024-03-09 04:18:47 +00:00
Eugene Tolbakov
aeca0d8e8a feat(influxdb): add db query param support for v2 write api (#3445)
* feat(influxdb): add db query param support for v2 write api

* fix(influxdb): update authorize logic to get catalog and schema from query string

* fix(influxdb): address CR suggestions

* fix(influxdb): use the correct import
2024-03-08 08:17:57 +00:00
Weny Xu
a309cd018a fix: fix incorrect COM_STMT_PREPARE reply (#3463)
* fix: fix incorrect `COM_STMT_PREPARE` reply

* chore: use column name instead of index
2024-03-08 07:31:20 +00:00
Yingwen
3ee53360ee perf: Reduce decode overhead during pruning keys in the memtable (#3415)
* feat: reuse value buf

* feat: skip values to decode

* feat: prune shard

chore: fix compiler errors

refactor: shard prune metrics

* fix: panic on DedupReader::try_new

* fix: prune after next

* chore: num parts metrics

* feat: metrics and logs

* chore: data build cost

* chore: more logs

* feat: cache skip result

* chore: todo

* fix: index out of bound

* test: test codec

* fix: invalid offsets

* fix: skip binary

* fix: offset buffer reuse

* chore: comment

* test: test memtable filter

* style: fix clippy

* chore: fix compiler error
2024-03-08 02:54:00 +00:00
JeremyHi
352bd7b6fd feat: max-txn-ops option (#3458)
* feat: max-txn-ops limit

* chore: by comment
2024-03-08 02:34:40 +00:00
Weny Xu
3f3ef2e7af refactor: separate the quote char and value (#3455)
refactor: use ident instead of string
2024-03-07 08:24:09 +00:00
Weny Xu
a218f12bd9 test: add fuzz test for create table (#3441)
* feat: add create table fuzz test

* chore: add ci cfg for fuzz tests

* refactor: remove redundant nightly config

* chore: run fuzz test in debug mode

* chore: use ubuntu-latest

* fix: close connection

* chore: add cache in fuzz test ci

* chore: apply suggestion from CR

* chore: apply suggestion from CR

* chore: refactor the fuzz test action
2024-03-07 06:51:19 +00:00
ZonaHe
c884c56151 feat: update dashboard to v0.4.8 (#3450)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2024-03-07 04:06:07 +00:00
Weny Xu
9ec288cab9 chore: specify binary name (#3449) 2024-03-07 03:56:24 +00:00
LFC
1f1491e429 feat: impl some "set"s to adapt to some client apps (#3443) 2024-03-06 13:15:48 +00:00
Weny Xu
c52bc613e0 chore: add bin opt to build cmd (#3440) 2024-03-06 08:24:55 +00:00
shuiyisong
a9d42f7b87 fix: add support for influxdb basic auth (#3437) 2024-03-06 03:56:25 +00:00
tison
86ce2d8713 build(deps): upgrade opendal to 0.45.1 (#3432)
* build(deps): upgrade opendal to 0.45.1

Signed-off-by: tison <wander4096@gmail.com>

* Update src/object-store/Cargo.toml

Co-authored-by: Weny Xu <wenymedia@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: Weny Xu <wenymedia@gmail.com>
2024-03-06 03:08:59 +00:00
Yingwen
5d644c0b7f chore: bump version to v0.7.0 (#3433) 2024-03-05 12:07:37 +00:00
Ruihang Xia
020635063c feat: implement multi-dim partition rule (#3409)
* generate expr rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement show create for new partition rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement row spliter

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: fix failed tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fix lint issues

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: ignore tests for deprecated partition rule

* chore: remove unused partition rule tests setup

* test(sqlness): add basic partition tests

* test(multi_dim): add basic find region test

* address CR comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: WenyXu <wenymedia@gmail.com>
Co-authored-by: WenyXu <wenymedia@gmail.com>
2024-03-05 11:39:15 +00:00
dependabot[bot]
97cbfcfe23 build(deps): bump mio from 0.8.10 to 0.8.11 (#3434)
Bumps [mio](https://github.com/tokio-rs/mio) from 0.8.10 to 0.8.11.
- [Release notes](https://github.com/tokio-rs/mio/releases)
- [Changelog](https://github.com/tokio-rs/mio/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tokio-rs/mio/compare/v0.8.10...v0.8.11)

---
updated-dependencies:
- dependency-name: mio
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-03-05 11:04:14 +00:00
Lei, HUANG
7183fa198c refactor: make MergeTreeMemtable the default choice (#3430)
* refactor: make MergeTreeMemtable the default choice

* refactor: reformat

* chore: add doc to config
2024-03-05 10:00:08 +00:00
Lei, HUANG
02b18fbca1 feat: decode prom requests to grpc (#3425)
* hack: inline decode

* move to servers

* fix: samples lost

* add bench

* remove useless functions

* wip

* feat: remove object pools

* fix: minor issues

* fix: remove useless dep

* chore: rebase main

* format

* finish

* fix: format

* feat: introduce request pool

* try to fix license issue

* fix: clippy

* resolve comments

* fix:typo

* remove useless comments
2024-03-05 09:47:32 +00:00
shuiyisong
7b1c3503d0 fix: complete interceptors for all frontend entry (#3428) 2024-03-05 09:38:47 +00:00
liyang
6fd2ff49d5 ci: refine windows output env (#3431) 2024-03-05 08:38:28 +00:00
WU Jingdi
53f2a5846c feat: support tracing rule sampler (#3405)
* feat: support tracing rule sampler

* chore: simplify code
2024-03-05 15:40:02 +08:00
Yingwen
49157868f9 feat: Correct server metrics and add more metrics for scan (#3426)
* feat: drop timer on stream terminated

* refactor: combine metrics into a histogram vec

* refactor: frontend grpc metrics

* feat: add metrics middleware layer to grpc server

* refactor: move http metrics layer to metrics mod

* feat: bucket for grpc/http elapsed

* feat: remove duplicate metrics

* style: fix cilppy

* fix: incorrect bucket of promql series

* feat: more metrics for mito

* feat: convert cost

* test: fix metrics test
2024-03-04 10:15:10 +00:00
Ruihang Xia
ae2c18e1cf docs(rfcs): multi-dimension partition rule (#3350)
* docs(rfcs): multi-dimension partition rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change math block type

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update tracking issue

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update discussion

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-04 08:10:54 +00:00
dennis zhuang
e6819412c5 refactor: show tables and show databases (#3423)
* refactor: show tables and show databases

* chore: clean code
2024-03-04 06:15:17 +00:00
262 changed files with 11747 additions and 2574 deletions

10
.editorconfig Normal file
View File

@@ -0,0 +1,10 @@
root = true
[*]
end_of_line = lf
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
[{Makefile,**.mk}]
indent_style = tab

View File

@@ -21,3 +21,6 @@ GT_GCS_CREDENTIAL_PATH = GCS credential path
GT_GCS_ENDPOINT = GCS end point
# Settings for kafka wal test
GT_KAFKA_ENDPOINTS = localhost:9092
# Setting for fuzz tests
GT_MYSQL_ADDR = localhost:4002

13
.github/actions/fuzz-test/action.yaml vendored Normal file
View File

@@ -0,0 +1,13 @@
name: Fuzz Test
description: 'Fuzz test given setup and service'
inputs:
target:
description: "The fuzz target to test"
runs:
using: composite
steps:
- name: Run Fuzz Test
shell: bash
run: cargo fuzz run ${{ inputs.target }} --fuzz-dir tests-fuzz -D -s none -- -max_total_time=120
env:
GT_MYSQL_ADDR: 127.0.0.1:4002

View File

@@ -102,7 +102,7 @@ jobs:
shared-key: "build-binaries"
- name: Build greptime binaries
shell: bash
run: cargo build
run: cargo build --bin greptime --bin sqlness-runner
- name: Pack greptime binaries
shell: bash
run: |
@@ -117,6 +117,46 @@ jobs:
artifacts-dir: bins
version: current
fuzztest:
name: Fuzz Test
needs: build
runs-on: ubuntu-latest
strategy:
matrix:
target: [ "fuzz_create_table", "fuzz_alter_table" ]
steps:
- uses: actions/checkout@v4
- uses: arduino/setup-protoc@v3
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}
- name: Rust Cache
uses: Swatinem/rust-cache@v2
with:
# Shares across multiple jobs
shared-key: "fuzz-test-targets"
- name: Set Rust Fuzz
shell: bash
run: |
sudo apt update && sudo apt install -y libfuzzer-14-dev
cargo install cargo-fuzz
- name: Download pre-built binaries
uses: actions/download-artifact@v4
with:
name: bins
path: .
- name: Unzip binaries
run: tar -xvf ./bins.tar.gz
- name: Run GreptimeDB
run: |
./bins/greptime standalone start&
- name: Fuzz Test
uses: ./.github/actions/fuzz-test
env:
CUSTOM_LIBFUZZER_PATH: /usr/lib/llvm-14/lib/libFuzzer.a
with:
target: ${{ matrix.target }}
sqlness:
name: Sqlness Test
needs: build

View File

@@ -91,7 +91,7 @@ env:
# The scheduled version is '${{ env.NEXT_RELEASE_VERSION }}-nightly-YYYYMMDD', like v0.2.0-nigthly-20230313;
NIGHTLY_RELEASE_PREFIX: nightly
# Note: The NEXT_RELEASE_VERSION should be modified manually by every formal release.
NEXT_RELEASE_VERSION: v0.7.0
NEXT_RELEASE_VERSION: v0.8.0
jobs:
allocate-runners:
@@ -288,7 +288,7 @@ jobs:
- name: Set build windows result
id: set-build-windows-result
run: |
echo "build-windows-result=success" >> $GITHUB_OUTPUT
echo "build-windows-result=success" >> $Env:GITHUB_OUTPUT
release-images-to-dockerhub:
name: Build and push images to DockerHub

4
.gitignore vendored
View File

@@ -46,3 +46,7 @@ benchmarks/data
*.code-workspace
venv/
# Fuzz tests
tests-fuzz/artifacts/
tests-fuzz/corpus/

315
Cargo.lock generated
View File

@@ -29,6 +29,17 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if 1.0.0",
"cipher",
"cpufeatures",
]
[[package]]
name = "ahash"
version = "0.7.7"
@@ -196,7 +207,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72"
[[package]]
name = "api"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"common-decimal",
@@ -241,6 +252,15 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "arbitrary"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "arc-swap"
version = "1.6.0"
@@ -675,7 +695,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -861,7 +881,7 @@ dependencies = [
[[package]]
name = "benchmarks"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"chrono",
@@ -992,6 +1012,15 @@ dependencies = [
"generic-array",
]
[[package]]
name = "block-padding"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
dependencies = [
"generic-array",
]
[[package]]
name = "borsh"
version = "1.3.0"
@@ -1219,7 +1248,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1266,6 +1295,15 @@ dependencies = [
"tokio",
]
[[package]]
name = "cbc"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
dependencies = [
"cipher",
]
[[package]]
name = "cc"
version = "1.0.83"
@@ -1421,6 +1459,16 @@ dependencies = [
"half 1.8.2",
]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]]
name = "clang-sys"
version = "1.6.1"
@@ -1510,7 +1558,7 @@ checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "client"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1546,7 +1594,7 @@ dependencies = [
"session",
"snafu",
"substrait 0.17.1",
"substrait 0.6.0",
"substrait 0.7.0",
"tokio",
"tokio-stream",
"tonic 0.10.2",
@@ -1576,7 +1624,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"async-trait",
@@ -1629,7 +1677,7 @@ dependencies = [
"session",
"snafu",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"temp-env",
"tikv-jemallocator",
@@ -1672,7 +1720,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"bitvec",
@@ -1687,7 +1735,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"chrono",
"common-error",
@@ -1698,7 +1746,7 @@ dependencies = [
[[package]]
name = "common-config"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"humantime-serde",
@@ -1709,7 +1757,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"arrow-schema",
@@ -1741,7 +1789,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"bigdecimal",
@@ -1755,7 +1803,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"snafu",
"strum 0.25.0",
@@ -1763,7 +1811,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1798,7 +1846,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-error",
@@ -1817,7 +1865,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -1847,7 +1895,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -1866,7 +1914,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arc-swap",
"common-query",
@@ -1881,7 +1929,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-error",
"common-macro",
@@ -1894,7 +1942,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-recursion",
@@ -1944,11 +1992,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.6.0"
version = "0.7.0"
[[package]]
name = "common-procedure"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-stream",
"async-trait",
@@ -1972,7 +2020,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-procedure",
@@ -1980,7 +2028,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -2003,7 +2051,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arc-swap",
"common-base",
@@ -2023,7 +2071,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-error",
@@ -2043,7 +2091,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"atty",
"backtrace",
@@ -2071,7 +2119,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"client",
"common-query",
@@ -2083,7 +2131,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"chrono",
@@ -2099,14 +2147,14 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"build-data",
]
[[package]]
name = "common-wal"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"common-error",
@@ -2754,7 +2802,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -2812,7 +2860,7 @@ dependencies = [
"snafu",
"sql",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tokio-stream",
@@ -2826,7 +2874,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"arrow-array",
@@ -2912,6 +2960,17 @@ dependencies = [
"syn 2.0.43",
]
[[package]]
name = "derive_arbitrary"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "derive_builder"
version = "0.11.2"
@@ -3302,7 +3361,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -3403,7 +3462,7 @@ dependencies = [
[[package]]
name = "flow"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"bimap",
@@ -3415,10 +3474,12 @@ dependencies = [
"common-telemetry",
"common-time",
"datatypes",
"enum_dispatch",
"hydroflow",
"itertools 0.10.5",
"num-traits",
"serde",
"serde_json",
"servers",
"session",
"snafu",
@@ -3458,7 +3519,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -3522,7 +3583,7 @@ dependencies = [
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"strfmt",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"toml 0.8.8",
@@ -4291,7 +4352,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -4406,6 +4467,16 @@ dependencies = [
"libc",
]
[[package]]
name = "inout"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
dependencies = [
"block-padding",
"generic-array",
]
[[package]]
name = "instant"
version = "0.1.12"
@@ -4746,9 +4817,20 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.151"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "libfuzzer-sys"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
dependencies = [
"arbitrary",
"cc",
"once_cell",
]
[[package]]
name = "libgit2-sys"
@@ -4848,7 +4930,7 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "log-store"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-stream",
"async-trait",
@@ -5137,7 +5219,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -5167,7 +5249,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"api",
@@ -5247,7 +5329,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"aquamarine",
@@ -5307,9 +5389,9 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.10"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
dependencies = [
"libc",
"log",
@@ -5319,7 +5401,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"api",
@@ -5380,6 +5462,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"toml 0.8.8",
"uuid",
]
@@ -5921,9 +6004,18 @@ dependencies = [
"memchr",
]
[[package]]
name = "object-pool"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee9a3e7196d09ec86002b939f1576e8e446d58def8fd48fe578e2c72d5328d68"
dependencies = [
"parking_lot 0.11.2",
]
[[package]]
name = "object-store"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anyhow",
"async-trait",
@@ -5979,9 +6071,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "opendal"
version = "0.44.2"
version = "0.45.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4af824652d4d2ffabf606d337a071677ae621b05622adf35df9562f69d9b4498"
checksum = "52c17c077f23fa2d2c25d9d22af98baa43b8bbe2ef0de80cf66339aa70401467"
dependencies = [
"anyhow",
"async-trait",
@@ -5997,7 +6089,7 @@ dependencies = [
"md-5",
"once_cell",
"percent-encoding",
"quick-xml 0.30.0",
"quick-xml 0.31.0",
"reqsign",
"reqwest",
"serde",
@@ -6166,7 +6258,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -6213,7 +6305,7 @@ dependencies = [
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tonic 0.10.2",
@@ -6444,7 +6536,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -6466,6 +6558,8 @@ dependencies = [
"serde",
"serde_json",
"snafu",
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"table",
]
@@ -6488,6 +6582,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
[[package]]
name = "pbkdf2"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
dependencies = [
"digest",
"hmac",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -6528,6 +6632,12 @@ version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "permutation"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7"
[[package]]
name = "pest"
version = "2.7.5"
@@ -6712,6 +6822,21 @@ dependencies = [
"spki 0.7.3",
]
[[package]]
name = "pkcs5"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e847e2c91a18bfa887dd028ec33f2fe6f25db77db3619024764914affe8b69a6"
dependencies = [
"aes",
"cbc",
"der 0.7.8",
"pbkdf2",
"scrypt",
"sha2",
"spki 0.7.3",
]
[[package]]
name = "pkcs8"
version = "0.8.0"
@@ -6730,6 +6855,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der 0.7.8",
"pkcs5",
"rand_core",
"spki 0.7.3",
]
@@ -6769,7 +6896,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"auth",
"common-base",
@@ -7036,7 +7163,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"ahash 0.8.6",
"async-recursion",
@@ -7247,7 +7374,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"bitflags 2.4.1",
@@ -7368,7 +7495,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"ahash 0.8.6",
"api",
@@ -7429,7 +7556,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tokio-stream",
@@ -7444,16 +7571,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "quick-xml"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -7736,9 +7853,9 @@ dependencies = [
[[package]]
name = "reqsign"
version = "0.14.6"
version = "0.14.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dce87f66ba6c6acef277a729f989a0eca946cb9ce6a15bcc036bda0f72d4b9fd"
checksum = "43e319d9de9ff4d941abf4ac718897118b0fe04577ea3f8e0f5788971784eef5"
dependencies = [
"anyhow",
"async-trait",
@@ -7763,7 +7880,6 @@ dependencies = [
"serde_json",
"sha1",
"sha2",
"tokio",
]
[[package]]
@@ -7956,6 +8072,7 @@ dependencies = [
"pkcs1 0.7.5",
"pkcs8 0.10.2",
"rand_core",
"sha2",
"signature",
"spki 0.7.3",
"subtle",
@@ -8690,6 +8807,15 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "salsa20"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213"
dependencies = [
"cipher",
]
[[package]]
name = "same-file"
version = "1.0.6"
@@ -8747,7 +8873,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "script"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -8803,6 +8929,17 @@ dependencies = [
"tokio-test",
]
[[package]]
name = "scrypt"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f"
dependencies = [
"pbkdf2",
"salsa20",
"sha2",
]
[[package]]
name = "sct"
version = "0.7.1"
@@ -9020,7 +9157,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"aide",
"api",
@@ -9054,6 +9191,7 @@ dependencies = [
"common-test-util",
"common-time",
"common-version",
"criterion",
"datafusion",
"datafusion-common",
"datafusion-expr",
@@ -9061,6 +9199,7 @@ dependencies = [
"derive_builder 0.12.0",
"digest",
"futures",
"hashbrown 0.14.3",
"headers",
"hex",
"hostname",
@@ -9073,11 +9212,13 @@ dependencies = [
"mime_guess",
"mysql_async",
"notify",
"object-pool",
"once_cell",
"openmetrics-parser",
"opensrv-mysql",
"opentelemetry-proto 0.3.0",
"parking_lot 0.12.1",
"permutation",
"pgwire",
"pin-project",
"postgres-types",
@@ -9122,7 +9263,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -9392,7 +9533,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"common-base",
@@ -9444,7 +9585,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"clap 4.4.11",
@@ -9651,7 +9792,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"aquamarine",
@@ -9791,7 +9932,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-recursion",
"async-trait",
@@ -9964,7 +10105,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"async-trait",
@@ -10076,17 +10217,21 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-fuzz"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arbitrary",
"async-trait",
"common-error",
"common-macro",
"common-query",
"common-runtime",
"common-telemetry",
"common-time",
"datatypes",
"derive_builder 0.12.0",
"dotenv",
"lazy_static",
"libfuzzer-sys",
"partition",
"rand",
"rand_chacha",
@@ -10101,7 +10246,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -10158,7 +10303,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tempfile",
"time",

View File

@@ -62,7 +62,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.6.0"
version = "0.7.0"
edition = "2021"
license = "Apache-2.0"
@@ -134,7 +134,7 @@ reqwest = { version = "0.11", default-features = false, features = [
rskafka = "0.5"
rust_decimal = "1.33"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_json = { version = "1.0", features = ["float_roundtrip"] }
serde_with = "3"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.7"

View File

@@ -3,6 +3,7 @@ CARGO_PROFILE ?=
FEATURES ?=
TARGET_DIR ?=
TARGET ?=
BUILD_BIN ?= greptime
CARGO_BUILD_OPTS := --locked
IMAGE_REGISTRY ?= docker.io
IMAGE_NAMESPACE ?= greptime
@@ -45,6 +46,10 @@ ifneq ($(strip $(TARGET)),)
CARGO_BUILD_OPTS += --target ${TARGET}
endif
ifneq ($(strip $(BUILD_BIN)),)
CARGO_BUILD_OPTS += --bin ${BUILD_BIN}
endif
ifneq ($(strip $(RELEASE)),)
CARGO_BUILD_OPTS += --release
endif

View File

@@ -29,7 +29,7 @@ use client::api::v1::column::Values;
use client::api::v1::{
Column, ColumnDataType, ColumnDef, CreateTableExpr, InsertRequest, InsertRequests, SemanticType,
};
use client::{Client, Database, Output, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use futures_util::TryStreamExt;
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
@@ -502,9 +502,9 @@ async fn do_query(num_iter: usize, db: &Database, table_name: &str) {
for i in 0..num_iter {
let now = Instant::now();
let res = db.sql(&query).await.unwrap();
match res {
Output::AffectedRows(_) | Output::RecordBatches(_) => (),
Output::Stream(stream, _) => {
match res.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => (),
OutputData::Stream(stream) => {
stream.try_collect::<Vec<_>>().await.unwrap();
}
}

View File

@@ -138,6 +138,18 @@ mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
data_freeze_threshold = 32768
# Max dictionary bytes.
fork_dictionary_bytes = "1GiB"
# Log options, see `standalone.example.toml`
# [logging]
# dir = "/tmp/greptimedb/logs"

View File

@@ -244,6 +244,18 @@ mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
data_freeze_threshold = 32768
# Max dictionary bytes.
fork_dictionary_bytes = "1GiB"
# Log options
# [logging]
# Specify logs directory.
@@ -254,10 +266,11 @@ intermediate_path = ""
# enable_otlp_tracing = false
# tracing exporter endpoint with format `ip:port`, we use grpc oltp as exporter, default endpoint is `localhost:4317`
# otlp_endpoint = "localhost:4317"
# The percentage of tracing will be sampled and exported. Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1. ratio > 1 are treated as 1. Fractions < 0 are treated as 0
# tracing_sample_ratio = 1.0
# Whether to append logs to stdout. Defaults to true.
# append_stdout = true
# The percentage of tracing will be sampled and exported. Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1. ratio > 1 are treated as 1. Fractions < 0 are treated as 0
# [logging.tracing_sample_ratio]
# default_ratio = 0.0
# Standalone export the metrics generated by itself
# encoded to Prometheus remote-write format

View File

@@ -79,7 +79,7 @@ This RFC proposes to add a new expression node `MergeScan` to merge result from
│ │ │ │
└─Frontend──────┘ └─Remote-Sources──────────────┘
```
This merge operation simply chains all the the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.
This merge operation simply chains all the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.
## Commutativity of MergeScan

Binary image file added, not shown (65 KiB).

View File

@@ -0,0 +1,101 @@
---
Feature Name: Multi-dimension Partition Rule
Tracking Issue: https://github.com/GreptimeTeam/greptimedb/issues/3351
Date: 2024-02-21
Author: "Ruihang Xia <waynestxia@gmail.com>"
---
# Summary
A new region partition scheme that runs on multiple dimensions of the key space. The partition rule is defined by a set of simple expressions on the partition key columns.
# Motivation
The current partition rule is from MySQL's [`RANGE Partition`](https://dev.mysql.com/doc/refman/8.0/en/partitioning-range.html), which is based on a single dimension. It is sort of a [Hilbert Curve](https://en.wikipedia.org/wiki/Hilbert_curve): it picks several points on the curve to divide the space. It is neither easy to understand how the data get partitioned nor flexible enough to handle complex partitioning requirements.
Considering future requirements like region repartitioning or autonomous rebalancing, where both the workload and the partitions may change frequently, this RFC proposes a new region partition scheme that uses a set of simple expressions on the partition key columns to divide the key space.
# Details
## Partition rule
First, we define a simple expression that can be used to define the partition rule. A simple expression is a binary expression on the partition key columns that can be evaluated to a boolean value. The binary operator is limited to comparison operators only, like `=`, `!=`, `>`, `>=`, `<`, `<=`, and the operands are limited to either a literal value or a partition column.
Examples of valid simple expressions are $`col_A = 10`$, $`col_A \gt 10 \& col_B \gt 20`$ or $`col_A \ne 10`$.
These expressions can be used as predicates to divide the key space into different regions. The following example has two partition columns, `Col A` and `Col B`, and four partitioned regions.
```math
\left\{\begin{aligned}
&col_A \le 10 &Region_1 \\
&10 \lt col_A \& col_A \le 20 &Region_2 \\
&20 \lt col_A \space \& \space col_B \lt 100 &Region_3 \\
&20 \lt col_A \space \& \space col_B \ge 100 &Region_4
\end{aligned}\right\}
```
An advantage of this scheme is that it is easy to understand how the data get partitioned. The above example can be visualized in a 2D space (two partition columns are involved in the example).
![example](2d-example.png)
Here each expression draws a line in the 2D space. Managing data partitioning becomes a matter of drawing lines in the key space.
To make it easy to use, there is a "default region" which catches all the data that doesn't match any of the previous expressions. The default region exists by default and does not need to be specified. It is also possible to remove this default region if the DB finds it is not necessary.
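To make the routing concrete, below is a minimal sketch (hypothetical, not GreptimeDB's actual partition code) that evaluates the four expressions above for a single row in order and falls back to the default region when none of them match:
```rust
/// Route one row to a region under the example rule above.
/// Returns `None` for the catch-all default region. For this particular
/// rule set the four expressions happen to cover the whole plane, so the
/// default region would only receive rows the expressions cannot classify.
fn route_region(col_a: i64, col_b: i64) -> Option<u32> {
    if col_a <= 10 {
        Some(1)
    } else if col_a > 10 && col_a <= 20 {
        Some(2)
    } else if col_a > 20 && col_b < 100 {
        Some(3)
    } else if col_a > 20 && col_b >= 100 {
        Some(4)
    } else {
        None // default region
    }
}

fn main() {
    assert_eq!(route_region(15, 50), Some(2)); // 10 < col_A <= 20
    assert_eq!(route_region(25, 150), Some(4)); // col_A > 20, col_B >= 100
    println!("routing works as expected");
}
```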
## SQL interface
The SQL interface covers two parts: specifying the partition columns and specifying the partition rule. Though we are targeting an autonomous system, it is still allowed to give some bootstrap rules or hints when creating a table.
Partition column is specified by `PARTITION ON COLUMNS` sub-clause in `CREATE TABLE`:
```sql
CREATE TABLE t (...)
PARTITION ON COLUMNS (...) ();
```
The two following brackets are for the partition columns and the partition rule, respectively.
The columns provided here are only used as an allow-list for how the partition rule can be defined. This means (a) the order of the columns doesn't matter, and (b) the columns provided here are not necessarily used in the partition rule.
The partition rule part is a list of comma-separated simple expressions. The expressions here do not correspond one-to-one to regions, as the system may change them to fit varying workloads.
A full example of `CREATE TABLE` with partition rule is:
```sql
CREATE TABLE IF NOT EXISTS demo (
a STRING,
b STRING,
c STRING,
d STRING,
ts TIMESTAMP,
memory DOUBLE,
TIME INDEX (ts),
PRIMARY KEY (a, b, c, d)
)
PARTITION ON COLUMNS (c, b, a) (
a < 10,
a >= 10 AND a < 20,
a >= 20 AND b < 100,
a >= 20 AND b >= 100
)
```
## Combine with storage
Examining columns separately suits our columnar storage very well in two aspects.
1. The simple expression can be pushed down to the storage and file format layers, and is likely to hit an existing index, which makes pruning very efficient.
2. Columns in columnar storage are not tightly coupled like in traditional row storage, which means we can easily add or remove columns from the partition rule without much impact (like a global reshuffle) on the data.
The data file itself can be "projected" onto the key space as a polyhedron, and it is guaranteed that each face is parallel to some coordinate plane (in a 2D scenario, this means every file can be projected onto a rectangle). Thus partitioning or repartitioning also only needs to consider the related columns.
![sst-project](sst-project.png)
An additional limitation is that, considering how the index works and how we organize the primary keys at present, the partition columns are limited to a subset of the primary key columns for better performance.
# Drawbacks
This is a breaking change.

Binary image file added, not shown (71 KiB).

View File

@@ -19,9 +19,9 @@ mod partitions;
mod predicate;
mod region_peers;
mod runtime_metrics;
mod schemata;
pub mod schemata;
mod table_names;
mod tables;
pub mod tables;
use std::collections::HashMap;
use std::sync::{Arc, Weak};

View File

@@ -37,8 +37,8 @@ use crate::error::{
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const CATALOG_NAME: &str = "catalog_name";
const SCHEMA_NAME: &str = "schema_name";
pub const CATALOG_NAME: &str = "catalog_name";
pub const SCHEMA_NAME: &str = "schema_name";
const DEFAULT_CHARACTER_SET_NAME: &str = "default_character_set_name";
const DEFAULT_COLLATION_NAME: &str = "default_collation_name";
const INIT_CAPACITY: usize = 42;

View File

@@ -39,10 +39,10 @@ use crate::error::{
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const TABLE_CATALOG: &str = "table_catalog";
const TABLE_SCHEMA: &str = "table_schema";
const TABLE_NAME: &str = "table_name";
const TABLE_TYPE: &str = "table_type";
pub const TABLE_CATALOG: &str = "table_catalog";
pub const TABLE_SCHEMA: &str = "table_schema";
pub const TABLE_NAME: &str = "table_name";
pub const TABLE_TYPE: &str = "table_type";
const TABLE_ID: &str = "table_id";
const ENGINE: &str = "engine";
const INIT_CAPACITY: usize = 42;

View File

@@ -307,7 +307,7 @@ impl Database {
reason: "Expect 'AffectedRows' Flight messages to be the one and the only!"
}
);
Ok(Output::AffectedRows(rows))
Ok(Output::new_with_affected_rows(rows))
}
FlightMessage::Recordbatch(_) | FlightMessage::Metrics(_) => {
IllegalFlightMessagesSnafu {
@@ -340,7 +340,7 @@ impl Database {
output_ordering: None,
metrics: Default::default(),
};
Ok(Output::new_stream(Box::pin(record_batch_stream)))
Ok(Output::new_with_stream(Box::pin(record_batch_stream)))
}
}
}

View File

@@ -26,7 +26,7 @@ use api::v1::greptime_response::Response;
use api::v1::{AffectedRows, GreptimeResponse};
pub use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::status_code::StatusCode;
pub use common_query::Output;
pub use common_query::{Output, OutputData, OutputMeta};
pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;

View File

@@ -62,7 +62,9 @@ pub struct BenchTableMetadataCommand {
impl BenchTableMetadataCommand {
pub async fn build(&self) -> Result<Instance> {
let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr]).await.unwrap();
let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr], 128)
.await
.unwrap();
let table_metadata_manager = Arc::new(TableMetadataManager::new(etcd_store));

View File

@@ -19,8 +19,7 @@ use async_trait::async_trait;
use clap::{Parser, ValueEnum};
use client::api::v1::auth_header::AuthScheme;
use client::api::v1::Basic;
use client::{Client, Database, DEFAULT_SCHEMA_NAME};
use common_query::Output;
use client::{Client, Database, OutputData, DEFAULT_SCHEMA_NAME};
use common_recordbatch::util::collect;
use common_telemetry::{debug, error, info, warn};
use datatypes::scalars::ScalarVector;
@@ -142,7 +141,7 @@ impl Export {
.with_context(|_| RequestDatabaseSnafu {
sql: "show databases".to_string(),
})?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)
@@ -183,7 +182,7 @@ impl Export {
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let Some(record_batch) = collect(stream)
@@ -235,7 +234,7 @@ impl Export {
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)

View File

@@ -19,7 +19,7 @@ use std::time::Instant;
use catalog::kvbackend::{
CachedMetaKvBackend, CachedMetaKvBackendBuilder, KvBackendCatalogManager,
};
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_base::Plugins;
use common_error::ext::ErrorExt;
use common_query::Output;
@@ -184,15 +184,15 @@ impl Repl {
}
.context(RequestDatabaseSnafu { sql: &sql })?;
let either = match output {
Output::Stream(s, _) => {
let either = match output.data {
OutputData::Stream(s) => {
let x = RecordBatches::try_collect(s)
.await
.context(CollectRecordBatchesSnafu)?;
Either::Left(x)
}
Output::RecordBatches(x) => Either::Left(x),
Output::AffectedRows(rows) => Either::Right(rows),
OutputData::RecordBatches(x) => Either::Left(x),
OutputData::AffectedRows(rows) => Either::Right(rows),
};
let end = Instant::now();

View File

@@ -70,7 +70,7 @@ impl UpgradeCommand {
etcd_addr: &self.etcd_addr,
})?;
let tool = MigrateTableMetadata {
etcd_store: EtcdStore::with_etcd_client(client),
etcd_store: EtcdStore::with_etcd_client(client, 128),
dryrun: self.dryrun,
skip_catalog_keys: self.skip_catalog_keys,
skip_table_global_keys: self.skip_table_global_keys,

View File

@@ -117,10 +117,12 @@ struct StartCommand {
/// The working home directory of this metasrv instance.
#[clap(long)]
data_home: Option<String>,
/// If it's not empty, the metasrv will store all data with this key prefix.
#[clap(long, default_value = "")]
store_key_prefix: String,
/// The max operations per txn
#[clap(long)]
max_txn_ops: Option<usize>,
}
impl StartCommand {
@@ -181,6 +183,10 @@ impl StartCommand {
opts.store_key_prefix = self.store_key_prefix.clone()
}
if let Some(max_txn_ops) = self.max_txn_ops {
opts.max_txn_ops = max_txn_ops;
}
// Disable dashboard in metasrv.
opts.http.disable_dashboard = true;

View File

@@ -28,12 +28,15 @@ const REGION: &str = "region";
const ENABLE_VIRTUAL_HOST_STYLE: &str = "enable_virtual_host_style";
pub fn is_supported_in_s3(key: &str) -> bool {
key == ENDPOINT
|| key == ACCESS_KEY_ID
|| key == SECRET_ACCESS_KEY
|| key == SESSION_TOKEN
|| key == REGION
|| key == ENABLE_VIRTUAL_HOST_STYLE
[
ENDPOINT,
ACCESS_KEY_ID,
SECRET_ACCESS_KEY,
SESSION_TOKEN,
REGION,
ENABLE_VIRTUAL_HOST_STYLE,
]
.contains(&key)
}
pub fn build_s3_backend(

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod clamp;
mod modulo;
mod pow;
mod rate;
@@ -19,6 +20,7 @@ mod rate;
use std::fmt;
use std::sync::Arc;
pub use clamp::ClampFunction;
use common_query::error::{GeneralDataFusionSnafu, Result};
use common_query::prelude::Signature;
use datafusion::error::DataFusionError;
@@ -40,7 +42,8 @@ impl MathFunction {
registry.register(Arc::new(ModuloFunction));
registry.register(Arc::new(PowFunction));
registry.register(Arc::new(RateFunction));
registry.register(Arc::new(RangeFunction))
registry.register(Arc::new(RangeFunction));
registry.register(Arc::new(ClampFunction));
}
}

View File

@@ -0,0 +1,403 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use std::sync::Arc;
use common_query::error::{InvalidFuncArgsSnafu, Result};
use common_query::prelude::Signature;
use datafusion::arrow::array::{ArrayIter, PrimitiveArray};
use datafusion::logical_expr::Volatility;
use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::prelude::VectorRef;
use datatypes::types::LogicalPrimitiveType;
use datatypes::value::TryAsPrimitive;
use datatypes::vectors::PrimitiveVector;
use datatypes::with_match_primitive_type_id;
use snafu::{ensure, OptionExt};
use crate::function::Function;
#[derive(Clone, Debug, Default)]
pub struct ClampFunction;
const CLAMP_NAME: &str = "clamp";
impl Function for ClampFunction {
fn name(&self) -> &str {
CLAMP_NAME
}
fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
// Type check is done by `signature`
Ok(input_types[0].clone())
}
fn signature(&self) -> Signature {
// input, min, max
Signature::uniform(3, ConcreteDataType::numerics(), Volatility::Immutable)
}
fn eval(
&self,
_func_ctx: crate::function::FunctionContext,
columns: &[VectorRef],
) -> Result<VectorRef> {
ensure!(
columns.len() == 3,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly 3, have: {}",
columns.len()
),
}
);
ensure!(
columns[0].data_type().is_numeric(),
InvalidFuncArgsSnafu {
err_msg: format!(
"The first arg's type is not numeric, have: {}",
columns[0].data_type()
),
}
);
ensure!(
columns[0].data_type() == columns[1].data_type()
&& columns[1].data_type() == columns[2].data_type(),
InvalidFuncArgsSnafu {
err_msg: format!(
"Arguments don't have identical types: {}, {}, {}",
columns[0].data_type(),
columns[1].data_type(),
columns[2].data_type()
),
}
);
ensure!(
columns[1].len() == 1 && columns[2].len() == 1,
InvalidFuncArgsSnafu {
err_msg: format!(
"The second and third args should be scalar, have: {:?}, {:?}",
columns[1], columns[2]
),
}
);
with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| {
let input_array = columns[0].to_arrow_array();
let input = input_array
.as_any()
.downcast_ref::<PrimitiveArray<<$S as LogicalPrimitiveType>::ArrowPrimitive>>()
.unwrap();
let min = TryAsPrimitive::<$S>::try_as_primitive(&columns[1].get(0))
.with_context(|| {
InvalidFuncArgsSnafu {
err_msg: "The second arg should not be none",
}
})?;
let max = TryAsPrimitive::<$S>::try_as_primitive(&columns[2].get(0))
.with_context(|| {
InvalidFuncArgsSnafu {
err_msg: "The third arg should not be none",
}
})?;
// ensure min <= max
ensure!(
min <= max,
InvalidFuncArgsSnafu {
err_msg: format!(
"The second arg should be less than or equal to the third arg, have: {:?}, {:?}",
columns[1], columns[2]
),
}
);
clamp_impl::<$S, true, true>(input, min, max)
},{
unreachable!()
})
}
}
impl Display for ClampFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", CLAMP_NAME.to_ascii_uppercase())
}
}
fn clamp_impl<T: LogicalPrimitiveType, const CLAMP_MIN: bool, const CLAMP_MAX: bool>(
input: &PrimitiveArray<T::ArrowPrimitive>,
min: T::Native,
max: T::Native,
) -> Result<VectorRef> {
common_telemetry::info!("[DEBUG] min {min:?}, max {max:?}");
let iter = ArrayIter::new(input);
let result = iter.map(|x| {
x.map(|x| {
if CLAMP_MIN && x < min {
min
} else if CLAMP_MAX && x > max {
max
} else {
x
}
})
});
let result = PrimitiveArray::<T::ArrowPrimitive>::from_iter(result);
Ok(Arc::new(PrimitiveVector::<T>::from(result)))
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use datatypes::prelude::ScalarVector;
use datatypes::vectors::{
ConstantVector, Float64Vector, Int64Vector, StringVector, UInt64Vector,
};
use super::*;
use crate::function::FunctionContext;
#[test]
fn clamp_i64() {
let inputs = [
(
vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
-1,
10,
vec![Some(-1), Some(-1), Some(-1), Some(0), Some(1), Some(2)],
),
(
vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
0,
0,
vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
),
(
vec![Some(-3), None, Some(-1), None, None, Some(2)],
-2,
1,
vec![Some(-2), None, Some(-1), None, None, Some(1)],
),
(
vec![None, None, None, None, None],
0,
1,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(Int64Vector::from(in_data)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(Int64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Int64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_u64() {
let inputs = [
(
vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
1,
3,
vec![Some(1), Some(1), Some(2), Some(3), Some(3), Some(3)],
),
(
vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
0,
0,
vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
),
(
vec![Some(0), None, Some(2), None, None, Some(5)],
1,
3,
vec![Some(1), None, Some(2), None, None, Some(3)],
),
(
vec![None, None, None, None, None],
0,
1,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(UInt64Vector::from(in_data)) as _,
Arc::new(UInt64Vector::from_vec(vec![min])) as _,
Arc::new(UInt64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(UInt64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_f64() {
let inputs = [
(
vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
-1.0,
10.0,
vec![Some(-1.0), Some(-1.0), Some(-1.0), Some(0.0), Some(1.0)],
),
(
vec![Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
0.0,
0.0,
vec![Some(0.0), Some(0.0), Some(0.0), Some(0.0)],
),
(
vec![Some(-3.0), None, Some(-1.0), None, None, Some(2.0)],
-2.0,
1.0,
vec![Some(-2.0), None, Some(-1.0), None, None, Some(1.0)],
),
(
vec![None, None, None, None, None],
0.0,
1.0,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(Float64Vector::from(in_data)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Float64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_const_i64() {
let input = vec![Some(5)];
let min = 2;
let max = 4;
let func = ClampFunction;
let args = [
Arc::new(ConstantVector::new(Arc::new(Int64Vector::from(input)), 1)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(Int64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Int64Vector::from(vec![Some(4)]));
assert_eq!(expected, result);
}
#[test]
fn clamp_invalid_min_max() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = 10.0;
let max = -1.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_type_not_match() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -1;
let max = 10;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(UInt64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_min_is_not_scalar() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -10.0;
let max = 1.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min, min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_no_max() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -10.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_on_string() {
let input = vec![Some("foo"), Some("foo"), Some("foo"), Some("foo")];
let func = ClampFunction;
let args = [
Arc::new(StringVector::from(input)) as _,
Arc::new(StringVector::from_vec(vec!["bar"])) as _,
Arc::new(StringVector::from_vec(vec!["baz"])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
}

View File

@@ -14,9 +14,11 @@
use std::sync::Arc;
mod greatest;
mod to_timezone;
mod to_unixtime;
use greatest::GreatestFunction;
use to_timezone::ToTimezoneFunction;
use to_unixtime::ToUnixtimeFunction;
use crate::function_registry::FunctionRegistry;
@@ -25,6 +27,7 @@ pub(crate) struct TimestampFunction;
impl TimestampFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(ToTimezoneFunction));
registry.register(Arc::new(ToUnixtimeFunction));
registry.register(Arc::new(GreatestFunction));
}

View File

@@ -0,0 +1,260 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt;
use std::sync::Arc;
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use common_time::{Timestamp, Timezone};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::types::TimestampType;
use datatypes::value::Value;
use datatypes::vectors::{
StringVector, TimestampMicrosecondVector, TimestampMillisecondVector,
TimestampNanosecondVector, TimestampSecondVector, Vector,
};
use snafu::{ensure, OptionExt};
use crate::function::{Function, FunctionContext};
use crate::helper;
#[derive(Clone, Debug, Default)]
pub struct ToTimezoneFunction;
const NAME: &str = "to_timezone";
fn convert_to_timezone(arg: &str) -> Option<Timezone> {
Timezone::from_tz_string(arg).ok()
}
fn convert_to_timestamp(arg: &Value) -> Option<Timestamp> {
match arg {
Value::Timestamp(ts) => Some(*ts),
_ => None,
}
}
impl fmt::Display for ToTimezoneFunction {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "TO_TIMEZONE")
}
}
impl Function for ToTimezoneFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
// Type is checked by `signature`; the input must be a timestamp type.
Ok(input_types[0].clone())
}
fn signature(&self) -> Signature {
helper::one_of_sigs2(
vec![
ConcreteDataType::timestamp_second_datatype(),
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::timestamp_microsecond_datatype(),
ConcreteDataType::timestamp_nanosecond_datatype(),
],
vec![ConcreteDataType::string_datatype()],
)
}
fn eval(&self, _ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 2,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly 2, have: {}",
columns.len()
),
}
);
// TODO: maybe support epoch timestamp? https://github.com/GreptimeTeam/greptimedb/issues/3477
let ts = columns[0].data_type().as_timestamp().with_context(|| {
UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
})?;
let array = columns[0].to_arrow_array();
let times = match ts {
TimestampType::Second(_) => {
let vector = TimestampSecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Millisecond(_) => {
let vector = TimestampMillisecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Microsecond(_) => {
let vector = TimestampMicrosecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Nanosecond(_) => {
let vector = TimestampNanosecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
};
let tzs = {
let array = columns[1].to_arrow_array();
let vector = StringVector::try_from_arrow_array(&array)
.ok()
.with_context(|| UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
})?;
(0..vector.len())
.map(|i| convert_to_timezone(&vector.get(i).to_string()))
.collect::<Vec<_>>()
};
let result = times
.iter()
.zip(tzs.iter())
.map(|(time, tz)| match (time, tz) {
(Some(time), _) => Some(time.to_timezone_aware_string(tz.as_ref())),
_ => None,
})
.collect::<Vec<Option<String>>>();
Ok(Arc::new(StringVector::from(result)))
}
}
#[cfg(test)]
mod tests {
use datatypes::scalars::ScalarVector;
use datatypes::timestamp::{
TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond,
};
use datatypes::vectors::StringVector;
use super::*;
#[test]
fn test_timestamp_to_timezone() {
let f = ToTimezoneFunction;
assert_eq!("to_timezone", f.name());
let results = vec![
Some("1969-12-31 19:00:01"),
None,
Some("1970-01-01 03:00:01"),
None,
];
let times: Vec<Option<TimestampSecond>> = vec![
Some(TimestampSecond::new(1)),
None,
Some(TimestampSecond::new(1)),
None,
];
let ts_vector: TimestampSecondVector =
TimestampSecondVector::from_owned_iterator(times.into_iter());
let tzs = vec![Some("America/New_York"), None, Some("Europe/Moscow"), None];
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.001"),
None,
Some("1970-01-01 03:00:00.001"),
None,
];
let times: Vec<Option<TimestampMillisecond>> = vec![
Some(TimestampMillisecond::new(1)),
None,
Some(TimestampMillisecond::new(1)),
None,
];
let ts_vector: TimestampMillisecondVector =
TimestampMillisecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.000001"),
None,
Some("1970-01-01 03:00:00.000001"),
None,
];
let times: Vec<Option<TimestampMicrosecond>> = vec![
Some(TimestampMicrosecond::new(1)),
None,
Some(TimestampMicrosecond::new(1)),
None,
];
let ts_vector: TimestampMicrosecondVector =
TimestampMicrosecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.000000001"),
None,
Some("1970-01-01 03:00:00.000000001"),
None,
];
let times: Vec<Option<TimestampNanosecond>> = vec![
Some(TimestampNanosecond::new(1)),
None,
Some(TimestampNanosecond::new(1)),
None,
];
let ts_vector: TimestampNanosecondVector =
TimestampNanosecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
}
}

View File

@@ -32,7 +32,7 @@ macro_rules! ok {
};
}
/// Internal util macro to to create an error.
/// Internal util macro to create an error.
macro_rules! error {
($span:expr, $msg: expr) => {
Err(syn::Error::new($span, $msg))

View File

@@ -67,6 +67,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to execute {} txn operations via Etcd", max_operations))]
EtcdTxnFailed {
max_operations: usize,
#[snafu(source)]
error: etcd_client::Error,
location: Location,
},
#[snafu(display("Failed to get sequence: {}", err_msg))]
NextSequence { err_msg: String, location: Location },
@@ -400,6 +408,7 @@ impl ErrorExt for Error {
IllegalServerState { .. }
| EtcdTxnOpResponse { .. }
| EtcdFailed { .. }
| EtcdTxnFailed { .. }
| ConnectEtcd { .. } => StatusCode::Internal,
SerdeJson { .. }

View File

@@ -464,7 +464,7 @@ impl TableMetadataManager {
pub fn max_logical_tables_per_batch(&self) -> usize {
// The batch size is max_txn_ops / 3 because creating the metadata
// for one logical table takes three txn operations.
self.kv_backend.max_txn_size() / 3
self.kv_backend.max_txn_ops() / 3
}
/// Creates metadata for multiple logical tables and return an error if different metadata exists.
@@ -860,6 +860,7 @@ mod tests {
use bytes::Bytes;
use common_time::util::current_time_millis;
use futures::TryStreamExt;
use store_api::storage::RegionId;
use table::metadata::{RawTableInfo, TableInfo};
use super::datanode_table::DatanodeTableKey;
@@ -1056,6 +1057,36 @@ mod tests {
);
}
#[tokio::test]
async fn test_create_many_logical_tables_metadata() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(kv_backend);
let mut tables_data = vec![];
for i in 0..128 {
let table_id = i + 1;
let region_number = table_id * 3;
let region_id = RegionId::new(table_id, region_number);
let region_route = new_region_route(region_id.as_u64(), 2);
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo = test_utils::new_test_table_info_with_name(
table_id,
&format!("my_table_{}", table_id),
region_routes.iter().map(|r| r.region.id.region_number()),
)
.into();
let table_route_value = TableRouteValue::physical(region_routes.clone());
tables_data.push((table_info, table_route_value));
}
// creates metadata.
table_metadata_manager
.create_logical_tables_metadata(tables_data)
.await
.unwrap();
}
#[tokio::test]
async fn test_delete_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());

View File

@@ -19,8 +19,9 @@ use datatypes::schema::{ColumnSchema, SchemaBuilder};
use store_api::storage::TableId;
use table::metadata::{TableInfo, TableInfoBuilder, TableMetaBuilder};
pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
pub fn new_test_table_info_with_name<I: IntoIterator<Item = u32>>(
table_id: TableId,
table_name: &str,
region_numbers: I,
) -> TableInfo {
let column_schemas = vec![
@@ -50,8 +51,14 @@ pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
TableInfoBuilder::default()
.table_id(table_id)
.table_version(5)
.name("mytable")
.name(table_name)
.meta(meta)
.build()
.unwrap()
}
pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
table_id: TableId,
region_numbers: I,
) -> TableInfo {
new_test_table_info_with_name(table_id, "mytable", region_numbers)
}

View File

@@ -45,6 +45,10 @@ impl TxnService for ChrootKvBackend {
let txn_res = self.inner.txn(txn).await?;
Ok(self.chroot_txn_response(txn_res))
}
fn max_txn_ops(&self) -> usize {
self.inner.max_txn_ops()
}
}
#[async_trait::async_trait]

View File

@@ -33,12 +33,6 @@ use crate::rpc::store::{
};
use crate::rpc::KeyValue;
// Maximum number of operations permitted in a transaction.
// The etcd default configuration's `--max-txn-ops` is 128.
//
// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
const MAX_TXN_SIZE: usize = 128;
fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
let (key, value) = kv.into_key_value();
KeyValue { key, value }
@@ -46,10 +40,15 @@ fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
pub struct EtcdStore {
client: Client,
// Maximum number of operations permitted in a transaction.
// The etcd default configuration's `--max-txn-ops` is 128.
//
// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
max_txn_ops: usize,
}
impl EtcdStore {
pub async fn with_endpoints<E, S>(endpoints: S) -> Result<KvBackendRef>
pub async fn with_endpoints<E, S>(endpoints: S, max_txn_ops: usize) -> Result<KvBackendRef>
where
E: AsRef<str>,
S: AsRef<[E]>,
@@ -58,16 +57,19 @@ impl EtcdStore {
.await
.context(error::ConnectEtcdSnafu)?;
Ok(Self::with_etcd_client(client))
Ok(Self::with_etcd_client(client, max_txn_ops))
}
pub fn with_etcd_client(client: Client) -> KvBackendRef {
Arc::new(Self { client })
pub fn with_etcd_client(client: Client, max_txn_ops: usize) -> KvBackendRef {
Arc::new(Self {
client,
max_txn_ops,
})
}
async fn do_multi_txn(&self, txn_ops: Vec<TxnOp>) -> Result<Vec<TxnResponse>> {
let max_txn_size = self.max_txn_size();
if txn_ops.len() < max_txn_size {
let max_txn_ops = self.max_txn_ops();
if txn_ops.len() < max_txn_ops {
// fast path
let _timer = METRIC_META_TXN_REQUEST
.with_label_values(&["etcd", "txn"])
@@ -83,7 +85,7 @@ impl EtcdStore {
}
let txns = txn_ops
.chunks(max_txn_size)
.chunks(max_txn_ops)
.map(|part| async move {
let _timer = METRIC_META_TXN_REQUEST
.with_label_values(&["etcd", "txn"])
@@ -311,18 +313,20 @@ impl TxnService for EtcdStore {
.with_label_values(&["etcd", "txn"])
.start_timer();
let max_operations = txn.max_operations();
let etcd_txn: Txn = txn.into();
let txn_res = self
.client
.kv_client()
.txn(etcd_txn)
.await
.context(error::EtcdFailedSnafu)?;
.context(error::EtcdTxnFailedSnafu { max_operations })?;
txn_res.try_into()
}
fn max_txn_size(&self) -> usize {
MAX_TXN_SIZE
fn max_txn_ops(&self) -> usize {
self.max_txn_ops
}
}
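
With the constructor change above, callers now pass the transaction-operation limit explicitly. A minimal sketch (illustrative; the endpoint is a placeholder, `128` simply mirrors etcd's default `--max-txn-ops`, and `EtcdStore` is assumed to be in scope):

async fn connect_etcd() {
    let _kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2379"], 128)
        .await
        .expect("failed to connect to etcd");
}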

View File

@@ -323,6 +323,10 @@ impl<T: ErrorExt + Send + Sync> TxnService for MemoryKvBackend<T> {
responses,
})
}
fn max_txn_ops(&self) -> usize {
usize::MAX
}
}
impl<T: ErrorExt + Send + Sync + 'static> ResettableKvBackend for MemoryKvBackend<T> {

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::max;
use common_error::ext::ErrorExt;
use crate::rpc::store::{DeleteRangeResponse, PutResponse, RangeResponse};
@@ -27,8 +29,8 @@ pub trait TxnService: Sync + Send {
}
/// Maximum number of operations permitted in a transaction.
fn max_txn_size(&self) -> usize {
usize::MAX
fn max_txn_ops(&self) -> usize {
unimplemented!("txn is not implemented")
}
}
@@ -192,6 +194,12 @@ impl Txn {
self.req.failure = operations.into();
self
}
#[inline]
pub fn max_operations(&self) -> usize {
let opc = max(self.req.compare.len(), self.req.success.len());
max(opc, self.req.failure.len())
}
}
impl From<Txn> for TxnRequest {

View File

@@ -152,7 +152,7 @@ impl Runner {
guard.key_guards.push(key_guard);
}
// Execute the procedure. We need to release the lock whenever the the execution
// Execute the procedure. We need to release the lock whenever the execution
// is successful or fail.
self.execute_procedure_in_loop().await;

View File

@@ -30,38 +30,87 @@ pub mod prelude;
mod signature;
use sqlparser_derive::{Visit, VisitMut};
// sql output
pub enum Output {
/// The new `Output` struct, carrying the output data (previously the `Output` enum itself) together with output meta
#[derive(Debug)]
pub struct Output {
pub data: OutputData,
pub meta: OutputMeta,
}
/// The data part of the original `Output` enum,
/// carrying result data to the response/client/user interface
pub enum OutputData {
AffectedRows(usize),
RecordBatches(RecordBatches),
Stream(SendableRecordBatchStream, Option<Arc<dyn PhysicalPlan>>),
Stream(SendableRecordBatchStream),
}
/// `OutputMeta` stores meta information produced during the execution
#[derive(Debug, Default)]
pub struct OutputMeta {
/// May exist for query output. One can retrieve execution metrics from this plan.
pub plan: Option<Arc<dyn PhysicalPlan>>,
pub cost: usize,
}
impl Output {
// helper function to build original `Output::Stream`
pub fn new_stream(stream: SendableRecordBatchStream) -> Self {
Output::Stream(stream, None)
pub fn new_with_affected_rows(affected_rows: usize) -> Self {
Self {
data: OutputData::AffectedRows(affected_rows),
meta: Default::default(),
}
}
pub fn new_with_record_batches(recordbatches: RecordBatches) -> Self {
Self {
data: OutputData::RecordBatches(recordbatches),
meta: Default::default(),
}
}
pub fn new_with_stream(stream: SendableRecordBatchStream) -> Self {
Self {
data: OutputData::Stream(stream),
meta: Default::default(),
}
}
pub fn new(data: OutputData, meta: OutputMeta) -> Self {
Self { data, meta }
}
}
impl Debug for Output {
impl Debug for OutputData {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Output::AffectedRows(rows) => write!(f, "Output::AffectedRows({rows})"),
Output::RecordBatches(recordbatches) => {
write!(f, "Output::RecordBatches({recordbatches:?})")
OutputData::AffectedRows(rows) => write!(f, "OutputData::AffectedRows({rows})"),
OutputData::RecordBatches(recordbatches) => {
write!(f, "OutputData::RecordBatches({recordbatches:?})")
}
Output::Stream(_, df) => {
if df.is_some() {
write!(f, "Output::Stream(<stream>, Some<physical_plan>)")
} else {
write!(f, "Output::Stream(<stream>)")
}
OutputData::Stream(_) => {
write!(f, "OutputData::Stream(<stream>)")
}
}
}
}
impl OutputMeta {
pub fn new(plan: Option<Arc<dyn PhysicalPlan>>, cost: usize) -> Self {
Self { plan, cost }
}
pub fn new_with_plan(plan: Arc<dyn PhysicalPlan>) -> Self {
Self {
plan: Some(plan),
cost: 0,
}
}
pub fn new_with_cost(cost: usize) -> Self {
Self { plan: None, cost }
}
}
pub use datafusion::physical_plan::ExecutionPlan as DfPhysicalPlan;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
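
The hunk above splits the old `Output` enum into an `OutputData` payload plus an `OutputMeta`. A sketch of how a caller might consume the new shape (illustrative only, not code from this change; it relies on the `common_query` re-exports used elsewhere in this diff):

use common_query::{Output, OutputData};

fn consume(output: Output) {
    // Execution metrics now travel in `meta` instead of inside the old Stream variant.
    if let Some(plan) = &output.meta.plan {
        let _ = plan; // query metrics can be read from this physical plan
    }
    match output.data {
        OutputData::AffectedRows(rows) => println!("affected {rows} rows"),
        OutputData::RecordBatches(batches) => println!("{batches:?}"),
        OutputData::Stream(_stream) => println!("got a stream"),
    }
}

Construction goes through the new helpers shown above, e.g. `Output::new_with_affected_rows(1)` or `Output::new(OutputData::Stream(stream), OutputMeta::new_with_plan(plan))`.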

View File

@@ -32,7 +32,7 @@ use snafu::ResultExt;
use crate::error::{self, Result};
use crate::{
DfRecordBatch, DfSendableRecordBatchStream, RecordBatch, RecordBatchStream,
DfRecordBatch, DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream,
SendableRecordBatchStream, Stream,
};
@@ -228,6 +228,10 @@ impl RecordBatchStream for RecordBatchStreamAdapter {
Metrics::Unavailable | Metrics::Unresolved(_) => None,
}
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
}
impl Stream for RecordBatchStreamAdapter {
@@ -316,6 +320,14 @@ impl RecordBatchStream for AsyncRecordBatchStreamAdapter {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for AsyncRecordBatchStreamAdapter {
@@ -375,6 +387,14 @@ mod test {
fn schema(&self) -> SchemaRef {
unimplemented!()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for MaybeErrorRecordBatchStream {

View File

@@ -39,13 +39,9 @@ use snafu::{ensure, ResultExt};
pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
fn schema(&self) -> SchemaRef;
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn output_ordering(&self) -> Option<&[OrderOption]>;
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics>;
}
pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;
@@ -74,6 +70,14 @@ impl RecordBatchStream for EmptyRecordBatchStream {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for EmptyRecordBatchStream {
@@ -192,6 +196,14 @@ impl RecordBatchStream for SimpleRecordBatchStream {
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for SimpleRecordBatchStream {

View File

@@ -41,7 +41,8 @@ mod tests {
use futures::Stream;
use super::*;
use crate::RecordBatchStream;
use crate::adapter::RecordBatchMetrics;
use crate::{OrderOption, RecordBatchStream};
struct MockRecordBatchStream {
batch: Option<RecordBatch>,
@@ -52,6 +53,14 @@ mod tests {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for MockRecordBatchStream {

View File

@@ -12,11 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(let_chains)]
pub mod logging;
mod macros;
pub mod metric;
mod panic_hook;
pub mod tracing_context;
mod tracing_sampler;
pub use logging::{init_default_ut_logging, init_global_logging};
pub use metric::dump_metrics;

View File

@@ -31,6 +31,7 @@ use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
use tracing_subscriber::{filter, EnvFilter, Registry};
use crate::tracing_sampler::{create_sampler, TracingSampleOptions};
pub use crate::{debug, error, info, trace, warn};
const DEFAULT_OTLP_ENDPOINT: &str = "http://localhost:4317";
@@ -42,7 +43,7 @@ pub struct LoggingOptions {
pub level: Option<String>,
pub enable_otlp_tracing: bool,
pub otlp_endpoint: Option<String>,
pub tracing_sample_ratio: Option<f64>,
pub tracing_sample_ratio: Option<TracingSampleOptions>,
pub append_stdout: bool,
}
@@ -176,8 +177,10 @@ pub fn init_global_logging(
.expect("error parsing log level string");
let sampler = opts
.tracing_sample_ratio
.map(Sampler::TraceIdRatioBased)
.unwrap_or(Sampler::AlwaysOn);
.as_ref()
.map(create_sampler)
.map(Sampler::ParentBased)
.unwrap_or(Sampler::ParentBased(Box::new(Sampler::AlwaysOn)));
// Must enable 'tokio_unstable' cfg to use this feature.
// For example: `RUSTFLAGS="--cfg tokio_unstable" cargo run -F common-telemetry/console -- standalone start`
#[cfg(feature = "tokio-console")]

View File

@@ -0,0 +1,176 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use opentelemetry::trace::{
Link, SamplingDecision, SamplingResult, SpanKind, TraceContextExt, TraceId, TraceState,
};
use opentelemetry::KeyValue;
use opentelemetry_sdk::trace::{Sampler, ShouldSample};
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(default)]
pub struct TracingSampleOptions {
pub default_ratio: f64,
pub rules: Vec<TracingSampleRule>,
}
impl Default for TracingSampleOptions {
fn default() -> Self {
Self {
default_ratio: 1.0,
rules: vec![],
}
}
}
/// Determines the sampling rate of a span according to the `rules` provided in `TracingSampleOptions`.
/// For spans that do not match any rule, the `default_ratio` is used.
#[derive(Clone, Default, Debug, Serialize, Deserialize)]
#[serde(default)]
pub struct TracingSampleRule {
pub protocol: String,
pub request_types: HashSet<String>,
pub ratio: f64,
}
impl TracingSampleRule {
pub fn match_rule(&self, protocol: &str, request_type: Option<&str>) -> Option<f64> {
if protocol == self.protocol {
if self.request_types.is_empty() {
Some(self.ratio)
} else if let Some(t) = request_type
&& self.request_types.contains(t)
{
Some(self.ratio)
} else {
None
}
} else {
None
}
}
}
impl PartialEq for TracingSampleOptions {
fn eq(&self, other: &Self) -> bool {
self.default_ratio == other.default_ratio && self.rules == other.rules
}
}
impl PartialEq for TracingSampleRule {
fn eq(&self, other: &Self) -> bool {
self.protocol == other.protocol
&& self.request_types == other.request_types
&& self.ratio == other.ratio
}
}
impl Eq for TracingSampleOptions {}
impl Eq for TracingSampleRule {}
pub fn create_sampler(opt: &TracingSampleOptions) -> Box<dyn ShouldSample> {
if opt.rules.is_empty() {
Box::new(Sampler::TraceIdRatioBased(opt.default_ratio))
} else {
Box::new(opt.clone())
}
}
impl ShouldSample for TracingSampleOptions {
fn should_sample(
&self,
parent_context: Option<&opentelemetry::Context>,
trace_id: TraceId,
_name: &str,
_span_kind: &SpanKind,
attributes: &[KeyValue],
_links: &[Link],
) -> SamplingResult {
let (mut protocol, mut request_type) = (None, None);
for kv in attributes {
match kv.key.as_str() {
"protocol" => protocol = Some(kv.value.as_str()),
"request_type" => request_type = Some(kv.value.as_str()),
_ => (),
}
}
let ratio = protocol
.and_then(|p| {
self.rules
.iter()
.find_map(|rule| rule.match_rule(p.as_ref(), request_type.as_deref()))
})
.unwrap_or(self.default_ratio);
SamplingResult {
decision: sample_based_on_probability(ratio, trace_id),
// No extra attributes ever set by the SDK samplers.
attributes: Vec::new(),
// Samplers in the SDK never modify the trace state.
trace_state: match parent_context {
Some(ctx) => ctx.span().span_context().trace_state().clone(),
None => TraceState::default(),
},
}
}
}
/// The code here mainly refers to the relevant implementation of
/// [opentelemetry](https://github.com/open-telemetry/opentelemetry-rust/blob/ef4701055cc39d3448d5e5392812ded00cdd4476/opentelemetry-sdk/src/trace/sampler.rs#L229),
/// and determines whether the span needs to be collected based on the `TraceId` and sampling rate (i.e. `prob`).
fn sample_based_on_probability(prob: f64, trace_id: TraceId) -> SamplingDecision {
if prob >= 1.0 {
SamplingDecision::RecordAndSample
} else {
let prob_upper_bound = (prob.max(0.0) * (1u64 << 63) as f64) as u64;
let bytes = trace_id.to_bytes();
let (_, low) = bytes.split_at(8);
let trace_id_low = u64::from_be_bytes(low.try_into().unwrap());
let rnd_from_trace_id = trace_id_low >> 1;
if rnd_from_trace_id < prob_upper_bound {
SamplingDecision::RecordAndSample
} else {
SamplingDecision::Drop
}
}
}
#[cfg(test)]
mod test {
use std::collections::HashSet;
use crate::tracing_sampler::TracingSampleRule;
#[test]
fn test_rule() {
let rule = TracingSampleRule {
protocol: "http".to_string(),
request_types: HashSet::new(),
ratio: 1.0,
};
assert_eq!(rule.match_rule("not_http", None), None);
assert_eq!(rule.match_rule("http", None), Some(1.0));
assert_eq!(rule.match_rule("http", Some("abc")), Some(1.0));
let rule1 = TracingSampleRule {
protocol: "http".to_string(),
request_types: HashSet::from(["mysql".to_string()]),
ratio: 1.0,
};
assert_eq!(rule1.match_rule("http", None), None);
assert_eq!(rule1.match_rule("http", Some("abc")), None);
assert_eq!(rule1.match_rule("http", Some("mysql")), Some(1.0));
}
}
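
A minimal configuration sketch for the sampler above (illustrative; it reuses the `http`/`mysql` rule from the test and, because the module is private, assumes the snippet lives alongside these definitions so the existing imports apply):

fn example_sampler() -> Box<dyn ShouldSample> {
    let opts = TracingSampleOptions {
        default_ratio: 0.1,
        rules: vec![TracingSampleRule {
            protocol: "http".to_string(),
            request_types: HashSet::from(["mysql".to_string()]),
            ratio: 1.0,
        }],
    };
    // "http" spans with request_type "mysql" are always sampled; every other span
    // falls back to the 10% TraceId-ratio default.
    create_sampler(&opts)
}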

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use client::Database;
use common_query::Output;
use common_query::OutputData;
use common_recordbatch::util;
pub enum ExpectedOutput<'a> {
@@ -23,22 +23,24 @@ pub enum ExpectedOutput<'a> {
pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) {
let output = db.sql(sql).await.unwrap();
let output = output.data;
match (&output, expected) {
(Output::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
(OutputData::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
assert_eq!(*x, y, "actual: \n{}", x)
}
(Output::RecordBatches(_), ExpectedOutput::QueryResult(x))
| (Output::Stream(_, _), ExpectedOutput::QueryResult(x)) => {
(OutputData::RecordBatches(_), ExpectedOutput::QueryResult(x))
| (OutputData::Stream(_), ExpectedOutput::QueryResult(x)) => {
check_output_stream(output, x).await
}
_ => panic!(),
}
}
pub async fn check_output_stream(output: Output, expected: &str) {
pub async fn check_output_stream(output: OutputData, expected: &str) {
let recordbatches = match output {
Output::Stream(stream, _) => util::collect_batches(stream).await.unwrap(),
Output::RecordBatches(recordbatches) => recordbatches,
OutputData::Stream(stream) => util::collect_batches(stream).await.unwrap(),
OutputData::RecordBatches(recordbatches) => recordbatches,
_ => unreachable!(),
};
let pretty_print = recordbatches.pretty_print().unwrap();

View File

@@ -36,7 +36,7 @@ use crate::{error, Interval};
/// - for [TimeUnit::Second]: [-262144-01-01 00:00:00, +262143-12-31 23:59:59]
/// - for [TimeUnit::Millisecond]: [-262144-01-01 00:00:00.000, +262143-12-31 23:59:59.999]
/// - for [TimeUnit::Microsecond]: [-262144-01-01 00:00:00.000000, +262143-12-31 23:59:59.999999]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145225, 2262-04-11 23:47:16.854775807]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145224192, 2262-04-11 23:47:16.854775807]
///
/// # Note:
/// For values out of range, you can still store these timestamps, but while performing arithmetic
@@ -187,28 +187,28 @@ impl Timestamp {
Self { unit, value }
}
pub fn new_second(value: i64) -> Self {
pub const fn new_second(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Second,
}
}
pub fn new_millisecond(value: i64) -> Self {
pub const fn new_millisecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Millisecond,
}
}
pub fn new_microsecond(value: i64) -> Self {
pub const fn new_microsecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Microsecond,
}
}
pub fn new_nanosecond(value: i64) -> Self {
pub const fn new_nanosecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Nanosecond,
@@ -281,8 +281,26 @@ impl Timestamp {
.and_then(|v| v.checked_add(micros as i64))
.map(Timestamp::new_microsecond)
} else {
// Refer to <https://github.com/chronotope/chrono/issues/1289>
//
// subsec nanos are always non-negative, however the timestamp itself (both in seconds and in nanos) can be
// negative. Now i64::MIN is NOT divisible by 1_000_000_000, so
//
// (sec * 1_000_000_000) + nsec
//
// may underflow (even when in theory we COULD represent the datetime as i64) because we add the non-negative
// nanos AFTER the multiplication. This is fixed by converting the negative case to
//
// ((sec + 1) * 1_000_000_000) + (nsec - 1_000_000_000)
let mut sec = sec;
let mut nsec = nsec as i64;
if sec < 0 && nsec > 0 {
nsec -= 1_000_000_000;
sec += 1;
}
sec.checked_mul(1_000_000_000)
.and_then(|v| v.checked_add(nsec as i64))
.and_then(|v| v.checked_add(nsec))
.map(Timestamp::new_nanosecond)
}
}
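
A worked instance of the adjustment described in the comment above, using the `i64::MIN` split that `test_min_nanos_roundtrip` below exercises (an illustrative test sketch; the literals are simply the mathematical split of `i64::MIN` nanoseconds into whole seconds and non-negative sub-second nanos):

#[test]
fn min_nanosecond_recombination_sketch() {
    let (sec, nsec): (i64, i64) = (-9_223_372_037, 145_224_192);
    // Naive recombination underflows: sec * 1_000_000_000 is below i64::MIN.
    assert!(sec.checked_mul(1_000_000_000).is_none());
    // The adjusted form lands exactly on i64::MIN.
    let adjusted = (sec + 1)
        .checked_mul(1_000_000_000)
        .and_then(|v| v.checked_add(nsec - 1_000_000_000));
    assert_eq!(adjusted, Some(i64::MIN));
}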
@@ -425,6 +443,20 @@ impl Timestamp {
}
}
impl Timestamp {
pub const MIN_SECOND: Self = Self::new_second(-8_334_601_228_800);
pub const MAX_SECOND: Self = Self::new_second(8_210_266_876_799);
pub const MIN_MILLISECOND: Self = Self::new_millisecond(-8_334_601_228_800_000);
pub const MAX_MILLISECOND: Self = Self::new_millisecond(8_210_266_876_799_999);
pub const MIN_MICROSECOND: Self = Self::new_microsecond(-8_334_601_228_800_000_000);
pub const MAX_MICROSECOND: Self = Self::new_microsecond(8_210_266_876_799_999_999);
pub const MIN_NANOSECOND: Self = Self::new_nanosecond(i64::MIN);
pub const MAX_NANOSECOND: Self = Self::new_nanosecond(i64::MAX);
}
/// Converts the naive datetime (which has no specific timezone) to a
/// nanosecond epoch timestamp in UTC.
fn naive_datetime_to_timestamp(
@@ -586,6 +618,7 @@ impl Hash for Timestamp {
mod tests {
use std::collections::hash_map::DefaultHasher;
use chrono_tz::Tz;
use rand::Rng;
use serde_json::Value;
@@ -1297,7 +1330,7 @@ mod tests {
"+262142-12-31 23:59:59Z",
"+262142-12-31 23:59:59.999Z",
"+262142-12-31 23:59:59.999999Z",
"1677-09-21 00:12:43.145225Z",
"1677-09-21 00:12:43.145224192Z",
"2262-04-11 23:47:16.854775807Z",
"+100000-01-01 00:00:01.5Z",
];
@@ -1306,4 +1339,47 @@ mod tests {
Timestamp::from_str_utc(s).unwrap();
}
}
#[test]
fn test_min_nanos_roundtrip() {
let (sec, nsec) = Timestamp::MIN_NANOSECOND.split();
let ts = Timestamp::from_splits(sec, nsec).unwrap();
assert_eq!(Timestamp::MIN_NANOSECOND, ts);
}
#[test]
fn test_timestamp_bound_format() {
assert_eq!(
"1677-09-21 00:12:43.145224192",
Timestamp::MIN_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"2262-04-11 23:47:16.854775807",
Timestamp::MAX_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59.999999",
Timestamp::MAX_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59.999",
Timestamp::MAX_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59",
Timestamp::MAX_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
}
}

View File

@@ -27,7 +27,7 @@ use common_error::ext::BoxedError;
use common_error::status_code::StatusCode;
use common_query::logical_plan::Expr;
use common_query::physical_plan::DfPhysicalPlanAdapter;
use common_query::{DfPhysicalPlan, Output};
use common_query::{DfPhysicalPlan, OutputData};
use common_recordbatch::SendableRecordBatchStream;
use common_runtime::Runtime;
use common_telemetry::tracing::{self, info_span};
@@ -651,11 +651,11 @@ impl RegionServerInner {
.await
.context(ExecuteLogicalPlanSnafu)?;
match result {
Output::AffectedRows(_) | Output::RecordBatches(_) => {
match result.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => {
UnsupportedOutputSnafu { expected: "stream" }.fail()
}
Output::Stream(stream, _) => Ok(stream),
OutputData::Stream(stream) => Ok(stream),
}
}

View File

@@ -370,6 +370,36 @@ impl Value {
}
}
pub trait TryAsPrimitive<T: LogicalPrimitiveType> {
fn try_as_primitive(&self) -> Option<T::Native>;
}
macro_rules! impl_try_as_primitive {
($Type: ident, $Variant: ident) => {
impl TryAsPrimitive<crate::types::$Type> for Value {
fn try_as_primitive(
&self,
) -> Option<<crate::types::$Type as crate::types::LogicalPrimitiveType>::Native> {
match self {
Value::$Variant(v) => Some((*v).into()),
_ => None,
}
}
}
};
}
impl_try_as_primitive!(Int8Type, Int8);
impl_try_as_primitive!(Int16Type, Int16);
impl_try_as_primitive!(Int32Type, Int32);
impl_try_as_primitive!(Int64Type, Int64);
impl_try_as_primitive!(UInt8Type, UInt8);
impl_try_as_primitive!(UInt16Type, UInt16);
impl_try_as_primitive!(UInt32Type, UInt32);
impl_try_as_primitive!(UInt64Type, UInt64);
impl_try_as_primitive!(Float32Type, Float32);
impl_try_as_primitive!(Float64Type, Float64);
pub fn to_null_scalar_value(output_type: &ConcreteDataType) -> Result<ScalarValue> {
Ok(match output_type {
ConcreteDataType::Null(_) => ScalarValue::Null,
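
A small usage sketch for the new `TryAsPrimitive` trait, which is what the `clamp` function earlier in this diff uses to pull its scalar bounds out of `Value`s (illustrative; the import paths are assumptions):

use datatypes::types::Int64Type;
use datatypes::value::{TryAsPrimitive, Value};

fn example() {
    // A matching variant yields the native primitive ...
    assert_eq!(
        TryAsPrimitive::<Int64Type>::try_as_primitive(&Value::Int64(5)),
        Some(5i64)
    );
    // ... while nulls (and mismatched variants) yield None.
    assert_eq!(TryAsPrimitive::<Int64Type>::try_as_primitive(&Value::Null), None);
}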
@@ -2387,4 +2417,12 @@ mod tests {
);
check_value_ref_size_eq(&ValueRef::Decimal128(Decimal128::new(1234, 3, 1)), 32)
}
#[test]
fn test_incorrect_default_value_issue_3479() {
let value = OrderedF64::from(0.047318541668048164);
let serialized = serde_json::to_string(&value).unwrap();
let deserialized: OrderedF64 = serde_json::from_str(&serialized).unwrap();
assert_eq!(value, deserialized);
}
}

View File

@@ -22,8 +22,9 @@ use std::task::{Context, Poll};
use common_datasource::object_store::build_backend;
use common_error::ext::BoxedError;
use common_query::prelude::Expr;
use common_recordbatch::adapter::RecordBatchMetrics;
use common_recordbatch::error::{CastVectorSnafu, ExternalSnafu, Result as RecordBatchResult};
use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use datafusion::logical_expr::utils as df_logical_expr_utils;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
@@ -151,6 +152,14 @@ impl RecordBatchStream for FileToScanRegionStream {
fn schema(&self) -> SchemaRef {
self.scan_schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for FileToScanRegionStream {

View File

@@ -18,6 +18,7 @@ common-query.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datatypes.workspace = true
enum_dispatch = "0.3"
hydroflow = "0.5.0"
itertools.workspace = true
num-traits = "0.2"
@@ -27,3 +28,6 @@ session.workspace = true
snafu.workspace = true
tokio.workspace = true
tonic.workspace = true
[dev-dependencies]
serde_json = "1.0"

View File

@@ -24,5 +24,6 @@ mod scalar;
pub(crate) use error::{EvalError, InvalidArgumentSnafu, OptimizeSnafu};
pub(crate) use func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
pub(crate) use id::{GlobalId, Id, LocalId};
pub(crate) use linear::{MapFilterProject, MfpPlan, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc};
pub(crate) use scalar::ScalarExpr;

View File

@@ -61,4 +61,7 @@ pub enum EvalError {
#[snafu(display("Unsupported temporal filter: {reason}"))]
UnsupportedTemporalFilter { reason: String, location: Location },
#[snafu(display("Overflowed during evaluation"))]
Overflow { location: Location },
}

View File

@@ -45,7 +45,7 @@ use crate::repr::{self, value_to_internal_ts, Diff, Row};
/// expressions in `self.expressions`, even though this is not something
/// we can directly evaluate. The plan creation methods will defensively
/// ensure that the right thing happens.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
pub struct MapFilterProject {
/// A sequence of expressions that should be appended to the row.
///
@@ -415,7 +415,7 @@ impl MapFilterProject {
}
/// A wrapper type which indicates it is safe to simply evaluate all expressions.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct SafeMfpPlan {
pub(crate) mfp: MapFilterProject,
}
@@ -800,7 +800,7 @@ mod test {
.unwrap();
// only retain sum result
let mfp = mfp.project(vec![4]).unwrap();
// accept only if if the sum is greater than 10
// accept only if the sum is greater than 10
let mfp = mfp
.filter(vec![ScalarExpr::Column(0).call_binary(
ScalarExpr::Literal(Value::from(10i32), ConcreteDataType::int32_datatype()),

View File

@@ -21,7 +21,7 @@ mod accum;
mod func;
/// Describes an aggregation expression.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
pub struct AggregateExpr {
/// Names the aggregation function.
pub func: AggregateFunc,

View File

@@ -14,7 +14,10 @@
//! Accumulators for aggregate functions that are accumulable, i.e. sum/count
//!
//! Currently support sum, count, any, all
//! An accumulator is only restored from a row and updated each time the dataflow needs to process a new batch of rows,
//! so the overhead is acceptable.
//!
//! Currently supports sum, count, any, all and min/max (with the caveat that min/max cannot support deletes within the aggregate).
use std::fmt::Display;
@@ -22,13 +25,506 @@ use common_decimal::Decimal128;
use common_time::{Date, DateTime};
use datatypes::data_type::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, OrderedFloat, Value};
use enum_dispatch::enum_dispatch;
use hydroflow::futures::stream::Concat;
use serde::{Deserialize, Serialize};
use snafu::ensure;
use crate::expr::error::{InternalSnafu, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::error::{InternalSnafu, OverflowSnafu, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::relation::func::GenericFn;
use crate::expr::{AggregateFunc, EvalError};
use crate::repr::Diff;
/// Accumulates values for the various types of accumulable aggregations.
#[enum_dispatch]
pub trait Accumulator: Sized {
fn into_state(self) -> Vec<Value>;
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError>;
fn update_batch<I>(&mut self, aggr_fn: &AggregateFunc, value_diffs: I) -> Result<(), EvalError>
where
I: IntoIterator<Item = (Value, Diff)>,
{
for (v, d) in value_diffs {
self.update(aggr_fn, v, d)?;
}
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError>;
}
/// Bool accumulator, used for `Any` `All` `Max/MinBool`
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct Bool {
/// The number of `true` values observed.
trues: Diff,
/// The number of `false` values observed.
falses: Diff,
}
impl TryFrom<Vec<Value>> for Bool {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "Bool Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
trues: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
falses: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for Bool {
fn into_state(self) -> Vec<Value> {
vec![self.trues.into(), self.falses.into()]
}
/// Null values are ignored
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool
),
InternalSnafu {
reason: format!(
"Bool Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
match value {
Value::Boolean(true) => self.trues += diff,
Value::Boolean(false) => self.falses += diff,
Value::Null => (), // ignore nulls
x => {
return Err(TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: x.data_type(),
}
.build());
}
};
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::Any => Ok(Value::from(self.trues > 0)),
AggregateFunc::All => Ok(Value::from(self.falses == 0)),
AggregateFunc::MaxBool => Ok(Value::from(self.trues > 0)),
AggregateFunc::MinBool => Ok(Value::from(self.falses == 0)),
_ => Err(InternalSnafu {
reason: format!(
"Bool Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates simple numeric values for sums over integers.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct SimpleNumber {
/// The accumulation of all non-NULL values observed.
accum: i128,
/// The number of non-NULL values observed.
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for SimpleNumber {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "Number Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
accum: Decimal128::try_from(iter.next().unwrap())
.map_err(err_try_from_val)?
.val(),
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for SimpleNumber {
fn into_state(self) -> Vec<Value> {
vec![
Value::Decimal128(Decimal128::new(self.accum, 38, 0)),
self.non_nulls.into(),
]
}
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64
),
InternalSnafu {
reason: format!(
"SimpleNumber Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
let v = match (aggr_fn, value) {
(AggregateFunc::SumInt16, Value::Int16(x)) => i128::from(x),
(AggregateFunc::SumInt32, Value::Int32(x)) => i128::from(x),
(AggregateFunc::SumInt64, Value::Int64(x)) => i128::from(x),
(AggregateFunc::SumUInt16, Value::UInt16(x)) => i128::from(x),
(AggregateFunc::SumUInt32, Value::UInt32(x)) => i128::from(x),
(AggregateFunc::SumUInt64, Value::UInt64(x)) => i128::from(x),
(_f, Value::Null) => return Ok(()), // ignore null
(f, v) => {
let expected_datatype = f.signature().input;
return Err(TypeMismatchSnafu {
expected: expected_datatype,
actual: v.data_type(),
}
.build())?;
}
};
self.accum += v * i128::from(diff);
self.non_nulls += diff;
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::SumInt16 | AggregateFunc::SumInt32 | AggregateFunc::SumInt64 => {
i64::try_from(self.accum)
.map_err(|_e| OverflowSnafu {}.build())
.map(Value::from)
}
AggregateFunc::SumUInt16 | AggregateFunc::SumUInt32 | AggregateFunc::SumUInt64 => {
u64::try_from(self.accum)
.map_err(|_e| OverflowSnafu {}.build())
.map(Value::from)
}
_ => Err(InternalSnafu {
reason: format!(
"SimpleNumber Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates float values for sums over floating-point numbers.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct Float {
/// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
/// accum will be set to zero if `non_nulls` is zero.
accum: OrderedF64,
/// Counts +inf
pos_infs: Diff,
/// Counts -inf
neg_infs: Diff,
/// Counts NaNs
nans: Diff,
/// Counts non-NULL values
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for Float {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 5,
InternalSnafu {
reason: "Float Accumulator state should have 5 values",
}
);
let mut iter = state.into_iter();
let mut ret = Self {
accum: OrderedF64::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
pos_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
neg_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
nans: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
};
// This prevents the counter-intuitive behavior of summing over no values
if ret.non_nulls == 0 {
ret.accum = OrderedFloat::from(0.0);
}
Ok(ret)
}
}
impl Accumulator for Float {
fn into_state(self) -> Vec<Value> {
vec![
self.accum.into(),
self.pos_infs.into(),
self.neg_infs.into(),
self.nans.into(),
self.non_nulls.into(),
]
}
/// Sum ignores nulls
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64
),
InternalSnafu {
reason: format!(
"Float Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
let x = match (aggr_fn, value) {
(AggregateFunc::SumFloat32, Value::Float32(x)) => OrderedF64::from(*x as f64),
(AggregateFunc::SumFloat64, Value::Float64(x)) => OrderedF64::from(x),
(_f, Value::Null) => return Ok(()), // ignore null
(f, v) => {
let expected_datatype = f.signature().input;
return Err(TypeMismatchSnafu {
expected: expected_datatype,
actual: v.data_type(),
}
.build())?;
}
};
if x.is_nan() {
self.nans += diff;
} else if x.is_infinite() {
if x.is_sign_positive() {
self.pos_infs += diff;
} else {
self.neg_infs += diff;
}
} else {
self.accum += *(x * OrderedF64::from(diff as f64));
}
self.non_nulls += diff;
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::SumFloat32 => Ok(Value::Float32(OrderedF32::from(self.accum.0 as f32))),
AggregateFunc::SumFloat64 => Ok(Value::Float64(self.accum)),
_ => Err(InternalSnafu {
reason: format!(
"Float Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates a single `Ord`ed `Value`, useful for min/max aggregations.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct OrdValue {
val: Option<Value>,
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for OrdValue {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "OrdValue Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
val: {
let v = iter.next().unwrap();
if v == Value::Null {
None
} else {
Some(v)
}
},
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for OrdValue {
fn into_state(self) -> Vec<Value> {
vec![self.val.unwrap_or(Value::Null), self.non_nulls.into()]
}
/// min/max look for a result among all non-null values; if all values are null, the result is null.
/// count(col_name) gives the number of non-null values, while count(*) gives the number of rows including nulls.
/// TODO(discord9): add count(*) as an aggr function
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
aggr_fn.is_max() || aggr_fn.is_min() || matches!(aggr_fn, AggregateFunc::Count),
InternalSnafu {
reason: format!(
"OrdValue Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
if diff <= 0 && (aggr_fn.is_max() || aggr_fn.is_min()) {
return Err(InternalSnafu {
reason: "OrdValue Accumulator does not support non-monotonic input for min/max aggregation".to_string(),
}.build());
}
// If aggr_fn is count, the incoming value's type doesn't matter for type checking;
// otherwise the types need to match, or the value must be null.
let check_type_aggr_fn_and_arg_value =
ty_eq_without_precision(value.data_type(), aggr_fn.signature().input)
|| matches!(aggr_fn, AggregateFunc::Count)
|| value.is_null();
let check_type_aggr_fn_and_self_val = self
.val
.as_ref()
.map(|zelf| ty_eq_without_precision(zelf.data_type(), aggr_fn.signature().input))
.unwrap_or(true)
|| matches!(aggr_fn, AggregateFunc::Count);
if !check_type_aggr_fn_and_arg_value {
return Err(TypeMismatchSnafu {
expected: aggr_fn.signature().input,
actual: value.data_type(),
}
.build());
} else if !check_type_aggr_fn_and_self_val {
return Err(TypeMismatchSnafu {
expected: aggr_fn.signature().input,
actual: self
.val
.as_ref()
.map(|v| v.data_type())
.unwrap_or(ConcreteDataType::null_datatype()),
}
.build());
}
// Nulls are ignored here; count(*) is compiled to count(true) so that nulls still get counted there.
if value.is_null() {
return Ok(());
}
// The count of non-null values is updated below.
self.non_nulls += diff;
match aggr_fn.signature().generic_fn {
GenericFn::Max => {
self.val = self
.val
.clone()
.map(|v| v.max(value.clone()))
.or_else(|| Some(value))
}
GenericFn::Min => {
self.val = self
.val
.clone()
.map(|v| v.min(value.clone()))
.or_else(|| Some(value))
}
GenericFn::Count => (),
_ => unreachable!("already checked by ensure!"),
}
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
if aggr_fn.is_max() || aggr_fn.is_min() {
Ok(self.val.clone().unwrap_or(Value::Null))
} else if matches!(aggr_fn, AggregateFunc::Count) {
Ok(self.non_nulls.into())
} else {
Err(InternalSnafu {
reason: format!(
"OrdValue Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build())
}
}
}
/// Accumulates values for the various types of accumulable aggregations.
///
/// We assume that there are not more than 2^32 elements for the aggregation.
@@ -38,34 +534,407 @@ use crate::repr::Diff;
/// The float accumulator performs accumulation with tolerance for floating point error.
///
/// TODO(discord9): check for overflowing
#[enum_dispatch(Accumulator)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Accum {
/// Accumulates boolean values.
Bool {
/// The number of `true` values observed.
trues: Diff,
/// The number of `false` values observed.
falses: Diff,
},
Bool(Bool),
/// Accumulates simple numeric values.
SimpleNumber {
/// The accumulation of all non-NULL values observed.
accum: i128,
/// The number of non-NULL values observed.
non_nulls: Diff,
},
SimpleNumber(SimpleNumber),
/// Accumulates float values.
Float {
/// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
/// accum will be set to zero if `non_nulls` is zero.
accum: OrderedF64,
/// Counts +inf
pos_infs: Diff,
/// Counts -inf
neg_infs: Diff,
/// Counts NaNs
nans: Diff,
/// Counts non-NULL values
non_nulls: Diff,
},
Float(Float),
/// Accumulates values that impl `Ord`
OrdValue(OrdValue),
}
impl Accum {
pub fn new_accum(aggr_fn: &AggregateFunc) -> Result<Self, EvalError> {
Ok(match aggr_fn {
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool => Self::from(Bool {
trues: 0,
falses: 0,
}),
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64 => Self::from(SimpleNumber {
accum: 0,
non_nulls: 0,
}),
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => Self::from(Float {
accum: OrderedF64::from(0.0),
pos_infs: 0,
neg_infs: 0,
nans: 0,
non_nulls: 0,
}),
f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
Self::from(OrdValue {
val: None,
non_nulls: 0,
})
}
f => {
return Err(InternalSnafu {
reason: format!(
"Accumulator does not support this aggregation function: {:?}",
f
),
}
.build());
}
})
}
pub fn try_into_accum(aggr_fn: &AggregateFunc, state: Vec<Value>) -> Result<Self, EvalError> {
match aggr_fn {
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool => Ok(Self::from(Bool::try_from(state)?)),
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64 => Ok(Self::from(SimpleNumber::try_from(state)?)),
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => {
Ok(Self::from(Float::try_from(state)?))
}
f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
Ok(Self::from(OrdValue::try_from(state)?))
}
f => Err(InternalSnafu {
reason: format!(
"Accumulator does not support this aggregation function: {:?}",
f
),
}
.build()),
}
}
}
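The round trip these constructors enable mirrors what the tests further down exercise: build an accumulator, feed it `(value, diff)` pairs, evaluate, then persist and restore its state. A minimal sketch (illustrative, not part of the change; assumes a `Result<_, EvalError>` context for `?`):

```rust
// Values follow the MaxInt32 test case below.
let aggr = AggregateFunc::MaxInt32;
let mut acc = Accum::new_accum(&aggr)?;
acc.update_batch(&aggr, vec![(Value::Int32(1), 1), (Value::Int32(2), 1)])?;
assert_eq!(acc.eval(&aggr)?, Value::Int32(2));
let state: Vec<Value> = acc.into_state(); // plain `Value`s, cheap to store and ship
let restored = Accum::try_into_accum(&aggr, state)?;
assert_eq!(restored.eval(&aggr)?, Value::Int32(2));
```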
fn err_try_from_val<T: Display>(reason: T) -> EvalError {
TryFromValueSnafu {
msg: reason.to_string(),
}
.build()
}
/// Compares types while ignoring their precision, including `Timestamp`, `Time`,
/// `Duration`, and `Interval`
fn ty_eq_without_precision(left: ConcreteDataType, right: ConcreteDataType) -> bool {
left == right
|| matches!(left, ConcreteDataType::Timestamp(..))
&& matches!(right, ConcreteDataType::Timestamp(..))
|| matches!(left, ConcreteDataType::Time(..)) && matches!(right, ConcreteDataType::Time(..))
|| matches!(left, ConcreteDataType::Duration(..))
&& matches!(right, ConcreteDataType::Duration(..))
|| matches!(left, ConcreteDataType::Interval(..))
&& matches!(right, ConcreteDataType::Interval(..))
}
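A quick illustration of the helper above, using `ConcreteDataType` constructors that already appear elsewhere in this diff (illustrative snippet only):

```rust
// Precision is ignored within a type family, but different families stay distinct.
assert!(ty_eq_without_precision(
    ConcreteDataType::timestamp_second_datatype(),
    ConcreteDataType::timestamp_millisecond_datatype(),
));
assert!(!ty_eq_without_precision(
    ConcreteDataType::time_second_datatype(),
    ConcreteDataType::duration_second_datatype(),
));
```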
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_accum() {
let testcases = vec![
(
AggregateFunc::SumInt32,
vec![(Value::Int32(1), 1), (Value::Null, 1)],
(
Value::Int64(1),
vec![Value::Decimal128(Decimal128::new(1, 38, 0)), 1i64.into()],
),
),
(
AggregateFunc::SumFloat32,
vec![(Value::Float32(OrderedF32::from(1.0)), 1), (Value::Null, 1)],
(
Value::Float32(OrderedF32::from(1.0)),
vec![
Value::Float64(OrderedF64::from(1.0)),
0i64.into(),
0i64.into(),
0i64.into(),
1i64.into(),
],
),
),
(
AggregateFunc::MaxInt32,
vec![(Value::Int32(1), 1), (Value::Int32(2), 1), (Value::Null, 1)],
(Value::Int32(2), vec![Value::Int32(2), 2i64.into()]),
),
(
AggregateFunc::MinInt32,
vec![(Value::Int32(2), 1), (Value::Int32(1), 1), (Value::Null, 1)],
(Value::Int32(1), vec![Value::Int32(1), 2i64.into()]),
),
(
AggregateFunc::MaxFloat32,
vec![
(Value::Float32(OrderedF32::from(1.0)), 1),
(Value::Float32(OrderedF32::from(2.0)), 1),
(Value::Null, 1),
],
(
Value::Float32(OrderedF32::from(2.0)),
vec![Value::Float32(OrderedF32::from(2.0)), 2i64.into()],
),
),
(
AggregateFunc::MaxDateTime,
vec![
(Value::DateTime(DateTime::from(0)), 1),
(Value::DateTime(DateTime::from(1)), 1),
(Value::Null, 1),
],
(
Value::DateTime(DateTime::from(1)),
vec![Value::DateTime(DateTime::from(1)), 2i64.into()],
),
),
(
AggregateFunc::Count,
vec![
(Value::Int32(1), 1),
(Value::Int32(2), 1),
(Value::Null, 1),
(Value::Null, 1),
],
(2i64.into(), vec![Value::Null, 2i64.into()]),
),
(
AggregateFunc::Any,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(true),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::All,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(false),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::MaxBool,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(true),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::MinBool,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(false),
vec![Value::from(1i64), Value::from(2i64)],
),
),
];
for (aggr_fn, input, (eval_res, state)) in testcases {
let create_and_insert = || -> Result<Accum, EvalError> {
let mut acc = Accum::new_accum(&aggr_fn)?;
acc.update_batch(&aggr_fn, input.clone())?;
let row = acc.into_state();
let acc = Accum::try_into_accum(&aggr_fn, row)?;
Ok(acc)
};
let acc = match create_and_insert() {
Ok(acc) => acc,
Err(err) => panic!(
"Failed to create accum for {:?} with input {:?} with error: {:?}",
aggr_fn, input, err
),
};
if acc.eval(&aggr_fn).unwrap() != eval_res {
panic!(
"Failed to eval accum for {:?} with input {:?}, expect {:?}, got {:?}",
aggr_fn,
input,
eval_res,
acc.eval(&aggr_fn).unwrap()
);
}
let actual_state = acc.into_state();
if actual_state != state {
panic!(
"Failed to cast into state from accum for {:?} with input {:?}, expect state {:?}, got state {:?}",
aggr_fn,
input,
state,
actual_state
);
}
}
}
#[test]
fn test_fail_path_accum() {
{
let bool_accum = Bool::try_from(vec![Value::Null]);
assert!(matches!(bool_accum, Err(EvalError::Internal { .. })));
}
{
let mut bool_accum = Bool::try_from(vec![1i64.into(), 1i64.into()]).unwrap();
// serde
let bool_accum_serde = serde_json::to_string(&bool_accum).unwrap();
let bool_accum_de = serde_json::from_str::<Bool>(&bool_accum_serde).unwrap();
assert_eq!(bool_accum, bool_accum_de);
assert!(matches!(
bool_accum.update(&AggregateFunc::MaxDate, 1.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
bool_accum.update(&AggregateFunc::Any, 1.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
bool_accum.eval(&AggregateFunc::MaxDate),
Err(EvalError::Internal { .. })
));
}
{
let ret = SimpleNumber::try_from(vec![Value::Null]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum =
SimpleNumber::try_from(vec![Decimal128::new(0, 38, 0).into(), 0i64.into()])
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::SumInt64, 0i32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
accum.eval(&AggregateFunc::All),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::SumInt64, 1i64.into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumInt64, i64::MAX.into(), 1)
.unwrap();
assert!(matches!(
accum.eval(&AggregateFunc::SumInt64),
Err(EvalError::Overflow { .. })
));
}
{
let ret = Float::try_from(vec![2f64.into(), 0i64.into(), 0i64.into(), 0i64.into()]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum = Float::try_from(vec![
2f64.into(),
0i64.into(),
0i64.into(),
0i64.into(),
1i64.into(),
])
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, 2f64.into(), -1)
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::SumFloat64, 0.0f32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
// no record, no accum
assert_eq!(
accum.eval(&AggregateFunc::SumFloat64).unwrap(),
0.0f64.into()
);
assert!(matches!(
accum.eval(&AggregateFunc::All),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::SumFloat64, f64::INFINITY.into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, (-f64::INFINITY).into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, f64::NAN.into(), 1)
.unwrap();
}
{
let ret = OrdValue::try_from(vec![Value::Null]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::MaxInt16, 1i16.into(), 1)
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::MaxInt16, 0i32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::MaxInt16, 0i16.into(), -1),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::MaxInt16, Value::Null, 1)
.unwrap();
}
// insert uint64 into max_int64 should fail
{
let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
assert!(matches!(
accum.update(&AggregateFunc::MaxInt64, 0u64.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
}
}
}

View File

@@ -12,15 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::type_name;
use common_time::{Date, DateTime};
use datatypes::prelude::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, Value};
use serde::{Deserialize, Serialize};
use crate::expr::error::{EvalError, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::relation::accum::Accum;
use crate::expr::relation::accum::{Accum, Accumulator};
use crate::repr::Diff;
/// Aggregate functions that can be applied to a group of rows.
@@ -83,3 +81,280 @@ pub enum AggregateFunc {
Any,
All,
}
impl AggregateFunc {
pub fn is_max(&self) -> bool {
self.signature().generic_fn == GenericFn::Max
}
pub fn is_min(&self) -> bool {
self.signature().generic_fn == GenericFn::Min
}
pub fn is_sum(&self) -> bool {
self.signature().generic_fn == GenericFn::Sum
}
/// Evaluates `(value, diff)` pairs against an accumulator.
///
/// Expects `self` to be an accumulable aggregate function, i.e. sum/count.
///
/// TODO(discord9): deal with overflow & a better accumulator
pub fn eval_diff_accumulable<I>(
&self,
accum: Vec<Value>,
value_diffs: I,
) -> Result<(Value, Vec<Value>), EvalError>
where
I: IntoIterator<Item = (Value, Diff)>,
{
let mut accum = if accum.is_empty() {
Accum::new_accum(self)?
} else {
Accum::try_into_accum(self, accum)?
};
accum.update_batch(self, value_diffs)?;
let res = accum.eval(self)?;
Ok((res, accum.into_state()))
}
}
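A hedged usage sketch of `eval_diff_accumulable`, modeled on the `SumInt32` case in the accumulator tests (not part of the change; assumes a `Result<_, EvalError>` context for `?`). An empty `accum` vector means "start from a fresh accumulator", and the returned state can be fed back to continue the aggregation incrementally:

```rust
// First call starts from an empty state.
let (result, state) = AggregateFunc::SumInt32
    .eval_diff_accumulable(vec![], vec![(Value::Int32(1), 1), (Value::Null, 1)])?;
assert_eq!(result, Value::Int64(1)); // same expected value as in the accum tests
// Feeding the persisted state back accumulates further updates: 1 + 2.
let (result, _state) =
    AggregateFunc::SumInt32.eval_diff_accumulable(state, vec![(Value::Int32(2), 1)])?;
assert_eq!(result, Value::Int64(3));
```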
pub struct Signature {
pub input: ConcreteDataType,
pub output: ConcreteDataType,
pub generic_fn: GenericFn,
}
#[derive(Debug, PartialEq, Eq)]
pub enum GenericFn {
Max,
Min,
Sum,
Count,
Any,
All,
}
impl AggregateFunc {
/// All concrete datatypes with precision are returned as their largest possible variant.
/// As an exception, count has a signature of `null -> i64`, but it's actually `anytype -> i64`.
pub fn signature(&self) -> Signature {
match self {
AggregateFunc::MaxInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxBool => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxString => Signature {
input: ConcreteDataType::string_datatype(),
output: ConcreteDataType::string_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDate => Signature {
input: ConcreteDataType::date_datatype(),
output: ConcreteDataType::date_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDateTime => Signature {
input: ConcreteDataType::datetime_datatype(),
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxTimestamp => Signature {
input: ConcreteDataType::timestamp_second_datatype(),
output: ConcreteDataType::timestamp_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxTime => Signature {
input: ConcreteDataType::time_second_datatype(),
output: ConcreteDataType::time_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDuration => Signature {
input: ConcreteDataType::duration_second_datatype(),
output: ConcreteDataType::duration_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInterval => Signature {
input: ConcreteDataType::interval_year_month_datatype(),
output: ConcreteDataType::interval_year_month_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MinInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinBool => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinString => Signature {
input: ConcreteDataType::string_datatype(),
output: ConcreteDataType::string_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDate => Signature {
input: ConcreteDataType::date_datatype(),
output: ConcreteDataType::date_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDateTime => Signature {
input: ConcreteDataType::datetime_datatype(),
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinTimestamp => Signature {
input: ConcreteDataType::timestamp_second_datatype(),
output: ConcreteDataType::timestamp_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinTime => Signature {
input: ConcreteDataType::time_second_datatype(),
output: ConcreteDataType::time_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDuration => Signature {
input: ConcreteDataType::duration_second_datatype(),
output: ConcreteDataType::duration_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInterval => Signature {
input: ConcreteDataType::interval_year_month_datatype(),
output: ConcreteDataType::interval_year_month_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::SumInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::Count => Signature {
input: ConcreteDataType::null_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Count,
},
AggregateFunc::Any => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Any,
},
AggregateFunc::All => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::All,
},
}
}
}
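For reference, a tiny check of the `Count` arm above (illustrative snippet, not part of the change): the declared input type is `null`, even though the type checks special-case `Count` so that any value type is counted at runtime.

```rust
let sig = AggregateFunc::Count.signature();
assert_eq!(sig.input, ConcreteDataType::null_datatype());
assert_eq!(sig.output, ConcreteDataType::int64_datatype());
assert!(matches!(sig.generic_fn, GenericFn::Count));
```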

View File

@@ -17,4 +17,5 @@
// allow unused for now because it should be use later
mod adapter;
mod expr;
mod plan;
mod repr;

src/flow/src/plan.rs Normal file
View File

@@ -0,0 +1,98 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! This module contains basic definitions for dataflow plans
//! that can be translated to hydro dataflows
mod join;
mod reduce;
use serde::{Deserialize, Serialize};
pub(crate) use self::reduce::{AccumulablePlan, KeyValPlan, ReducePlan};
use crate::expr::{
AggregateExpr, EvalError, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
};
use crate::plan::join::JoinPlan;
use crate::repr::{DiffRow, RelationType};
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct TypedPlan {
/// output type of the relation
pub typ: RelationType,
pub plan: Plan,
}
/// TODO(discord9): support `TableFunc` by defining a FlatMap that maps 1 row to n rows
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum Plan {
/// A constant collection of rows.
Constant { rows: Vec<DiffRow> },
/// Gets CDC data from a source, be it an external reference to an existing source or an internal
/// reference to a `Let` identifier
Get { id: Id },
/// Creates a temporary collection from the given `value`, and makes this binding
/// available only in the scope of `body`
Let {
id: LocalId,
value: Box<Plan>,
body: Box<Plan>,
},
/// Map, Filter, and Project operators.
Mfp {
/// The input collection.
input: Box<Plan>,
/// Linear operator to apply to each record.
mfp: MapFilterProject,
},
/// Reduce operator, aggregating by keys assembled from the `KeyValPlan`
Reduce {
/// The input collection.
input: Box<Plan>,
/// A plan for changing input records into key, value pairs.
key_val_plan: KeyValPlan,
/// A plan for performing the reduce.
///
/// The implementation of reduction has several different strategies based
/// on the properties of the reduction, and the input itself.
reduce_plan: ReducePlan,
},
/// A multiway relational equijoin, with fused map, filter, and projection.
///
/// This stage performs a multiway join among `inputs`, using the equality
/// constraints expressed in `plan`. The plan also describes the implementation
/// strategy we will use, and any pushed down per-record work.
Join {
/// An ordered list of inputs that will be joined.
inputs: Vec<Plan>,
/// Detailed information about the implementation of the join.
///
/// This includes information about the implementation strategy, but also
/// any map, filter, project work that we might follow the join with, but
/// potentially pushed down into the implementation of the join.
plan: JoinPlan,
},
/// Adds the contents of the input collections.
///
/// Importantly, this is *multiset* union, so the multiplicities of records will
/// add. This is in contrast to *set* union, where the multiplicities would be
/// capped at one. A set union can be formed with `Union` followed by `Reduce`
/// implementing the "distinct" operator.
Union {
/// The input collections
inputs: Vec<Plan>,
/// Whether to consolidate the output, e.g., cancel negated records.
consolidate_output: bool,
},
}
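A minimal, illustrative construction (not from this change) showing how the multiset `Union` variant composes; the empty `rows` vectors sidestep the `DiffRow` details:

```rust
// Assumes `use crate::plan::Plan;`. With `consolidate_output: true`, records
// whose diffs sum to zero are expected to cancel out in the output.
let _plan = Plan::Union {
    inputs: vec![
        Plan::Constant { rows: vec![] },
        Plan::Constant { rows: vec![] },
    ],
    consolidate_output: true,
};
```

As the doc comment notes, following such a `Union` with a `Reduce` whose plan is `ReducePlan::Distinct` yields a set union.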

src/flow/src/plan/join.rs Normal file
View File

@@ -0,0 +1,78 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::ScalarExpr;
use crate::plan::SafeMfpPlan;
/// TODO(discord9): consider impl more join strategies
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub enum JoinPlan {
Linear(LinearJoinPlan),
}
/// Determines if a given row should stay in the output, and applies a map/filter/project before outputting the row
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct JoinFilter {
/// Each element in the outer vector checks whether every expr within it evaluates to the same value;
/// if not, the row is filtered out. Useful for equi-joins (joins based on equality of some columns)
pub ready_equivalences: Vec<Vec<ScalarExpr>>,
/// Applies a map/filter/project before outputting the row
pub before: SafeMfpPlan,
}
/// A plan for the execution of a linear join.
///
/// A linear join is a sequence of stages, each of which introduces
/// a new collection. Each stage is represented by a [LinearStagePlan].
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearJoinPlan {
/// The source relation from which we start the join.
pub source_relation: usize,
/// The arrangement to use for the source relation, if any
pub source_key: Option<Vec<ScalarExpr>>,
/// An initial closure to apply before any stages.
///
/// Values of `None` indicate the identity closure.
pub initial_closure: Option<JoinFilter>,
/// A *sequence* of stages to apply one after the other.
pub stage_plans: Vec<LinearStagePlan>,
/// A concluding filter to apply after the last stage.
///
/// Values of `None` indicate the identity closure.
pub final_closure: Option<JoinFilter>,
}
/// A plan for the execution of one stage of a linear join.
///
/// Each stage is a binary join between the current accumulated
/// join results, and a new collection. The former is referred to
/// as the "stream" and the latter the "lookup".
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearStagePlan {
/// The index of the relation into which we will look up.
pub lookup_relation: usize,
/// The key expressions to use for the stream relation.
pub stream_key: Vec<ScalarExpr>,
/// Columns to retain from the stream relation.
/// These columns are those that are not redundant with `stream_key`,
/// and cannot be read out of the key component of an arrangement.
pub stream_thinning: Vec<usize>,
/// The key expressions to use for the lookup relation.
pub lookup_key: Vec<ScalarExpr>,
/// The closure to apply to the concatenation of the key columns,
/// the stream value columns, and the lookup value columns.
pub closure: JoinFilter,
}

View File

@@ -0,0 +1,50 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::{AggregateExpr, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr};
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct KeyValPlan {
pub key_plan: SafeMfpPlan,
pub val_plan: SafeMfpPlan,
}
/// TODO(discord9): define & implement hierarchical aggregates (for min/max with support for deletion),
/// basic aggregates (for other aggregate functions), and mixed aggregates
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum ReducePlan {
/// Plan for not computing any aggregations, just determining the set of
/// distinct keys.
Distinct,
/// Plan for computing only accumulable aggregations.
/// Includes simple functions like `sum`, `count`, and `min`/`max` (without deletion)
Accumulable(AccumulablePlan),
}
/// Accumulable plan for the execution of a reduction.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct AccumulablePlan {
/// All of the aggregations we were asked to compute, stored
/// in order.
pub full_aggrs: Vec<AggregateExpr>,
/// All of the non-distinct accumulable aggregates.
/// Each element represents:
/// (index of aggr output, index of value among inputs, aggr expr)
/// These will all be rendered together in one dataflow fragment.
pub simple_aggrs: Vec<(usize, usize, AggregateExpr)>,
/// Same as above but for all of the `DISTINCT` accumulable aggregations.
pub distinct_aggrs: Vec<(usize, usize, AggregateExpr)>,
}

View File

@@ -33,7 +33,10 @@ use snafu::ResultExt;
use crate::expr::error::{CastValueSnafu, EvalError};
/// System-wide Record count difference type.
/// System-wide Record count difference type. Useful for capturing data changes
///
/// i.e. +1 means insert one record, -1 means remove,
/// and +/-n means insert/remove multiple duplicate records.
pub type Diff = i64;
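To make the convention concrete, here is a toy consolidation routine (a hypothetical, self-contained sketch, not part of this crate) that sums diffs per row and drops rows whose total multiplicity reaches zero:

```rust
use std::collections::HashMap;

/// Sums diffs per row key; a row whose diffs cancel out disappears entirely.
fn consolidate(updates: &[(&str, i64)]) -> HashMap<String, i64> {
    let mut acc: HashMap<String, i64> = HashMap::new();
    for &(row, diff) in updates {
        *acc.entry(row.to_string()).or_insert(0) += diff;
    }
    acc.retain(|_, d| *d != 0);
    acc
}

fn main() {
    // ("a", +2) then ("a", -2) cancel; ("b", +1) survives with multiplicity 1.
    let out = consolidate(&[("a", 2), ("a", -2), ("b", 1)]);
    assert_eq!(out.len(), 1);
    assert_eq!(out["b"], 1);
}
```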
/// System-wide default timestamp type

View File

@@ -28,6 +28,7 @@ use api::v1::meta::Role;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use catalog::CatalogManagerRef;
use client::OutputData;
use common_base::Plugins;
use common_config::KvBackendConfig;
use common_error::ext::BoxedError;
@@ -39,14 +40,16 @@ use common_procedure::local::{LocalManager, ManagerConfig};
use common_procedure::options::ProcedureConfig;
use common_procedure::ProcedureManagerRef;
use common_query::Output;
use common_telemetry::error;
use common_telemetry::logging::info;
use common_telemetry::{error, tracing};
use log_store::raft_engine::RaftEngineBackend;
use meta_client::client::{MetaClient, MetaClientBuilder};
use meta_client::MetaClientOptions;
use operator::delete::DeleterRef;
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prometheus::HistogramTimer;
use query::metrics::OnDone;
use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
use query::plan::LogicalPlan;
use query::query_engine::options::{validate_catalog_and_schema, QueryOptions};
@@ -85,7 +88,6 @@ use crate::error::{
};
use crate::frontend::{FrontendOptions, TomlSerializable};
use crate::heartbeat::HeartbeatTask;
use crate::metrics;
use crate::script::ScriptExecutor;
#[async_trait]
@@ -275,8 +277,8 @@ impl Instance {
impl SqlQueryHandler for Instance {
type Error = Error;
#[tracing::instrument(skip_all)]
async fn do_query(&self, query: &str, query_ctx: QueryContextRef) -> Vec<Result<Output>> {
let _timer = metrics::METRIC_HANDLE_SQL_ELAPSED.start_timer();
let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
let query_interceptor = query_interceptor_opt.as_ref();
let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) {
@@ -336,7 +338,6 @@ impl SqlQueryHandler for Instance {
}
async fn do_exec_plan(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result<Output> {
let _timer = metrics::METRIC_EXEC_PLAN_ELAPSED.start_timer();
// plan should be prepared before exec
// we'll do check there
self.query_engine
@@ -345,6 +346,7 @@ impl SqlQueryHandler for Instance {
.context(ExecLogicalPlanSnafu)
}
#[tracing::instrument(skip_all)]
async fn do_promql_query(
&self,
query: &PromQuery,
@@ -398,14 +400,27 @@ impl SqlQueryHandler for Instance {
}
}
/// Attaches a timer to the output and observes it once the output is exhausted.
pub fn attach_timer(output: Output, timer: HistogramTimer) -> Output {
match output.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => output,
OutputData::Stream(stream) => {
let stream = OnDone::new(stream, move || {
timer.observe_duration();
});
Output::new(OutputData::Stream(Box::pin(stream)), output.meta)
}
}
}
#[async_trait]
impl PrometheusHandler for Instance {
#[tracing::instrument(skip_all)]
async fn do_query(
&self,
query: &PromQuery,
query_ctx: QueryContextRef,
) -> server_error::Result<Output> {
let _timer = metrics::METRIC_HANDLE_PROMQL_ELAPSED.start_timer();
let interceptor = self
.plugins
.get::<PromQueryInterceptorRef<server_error::Error>>();

View File

@@ -20,6 +20,7 @@ use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_meta::table_name::TableName;
use common_query::Output;
use common_telemetry::tracing;
use query::parser::PromQuery;
use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef};
use servers::query_handler::grpc::GrpcQueryHandler;
@@ -31,7 +32,8 @@ use crate::error::{
Error, IncompleteGrpcRequestSnafu, NotSupportedSnafu, PermissionSnafu, Result,
TableOperationSnafu,
};
use crate::instance::Instance;
use crate::instance::{attach_timer, Instance};
use crate::metrics::{GRPC_HANDLE_PROMQL_ELAPSED, GRPC_HANDLE_SQL_ELAPSED};
#[async_trait]
impl GrpcQueryHandler for Instance {
@@ -59,6 +61,7 @@ impl GrpcQueryHandler for Instance {
})?;
match query {
Query::Sql(sql) => {
let timer = GRPC_HANDLE_SQL_ELAPSED.start_timer();
let mut result = SqlQueryHandler::do_query(self, &sql, ctx.clone()).await;
ensure!(
result.len() == 1,
@@ -66,7 +69,8 @@ impl GrpcQueryHandler for Instance {
feat: "execute multiple statements in SQL query string through GRPC interface"
}
);
result.remove(0)?
let output = result.remove(0)?;
attach_timer(output, timer)
}
Query::LogicalPlan(_) => {
return NotSupportedSnafu {
@@ -75,6 +79,7 @@ impl GrpcQueryHandler for Instance {
.fail();
}
Query::PromRangeQuery(promql) => {
let timer = GRPC_HANDLE_PROMQL_ELAPSED.start_timer();
let prom_query = PromQuery {
query: promql.query,
start: promql.start,
@@ -89,7 +94,8 @@ impl GrpcQueryHandler for Instance {
feat: "execute multiple statements in PromQL query string through GRPC interface"
}
);
result.remove(0)?
let output = result.remove(0)?;
attach_timer(output, timer)
}
}
}
@@ -107,7 +113,7 @@ impl GrpcQueryHandler for Instance {
.statement_executor
.create_table_inner(&mut expr, None, &ctx)
.await?;
Output::AffectedRows(0)
Output::new_with_affected_rows(0)
}
DdlExpr::Alter(expr) => self.statement_executor.alter_table_inner(expr).await?,
DdlExpr::CreateDatabase(expr) => {
@@ -173,6 +179,7 @@ fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryConte
}
impl Instance {
#[tracing::instrument(skip_all)]
pub async fn handle_inserts(
&self,
requests: InsertRequests,
@@ -184,6 +191,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_row_inserts(
&self,
requests: RowInsertRequests,
@@ -195,6 +203,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_metric_row_inserts(
&self,
requests: RowInsertRequests,
@@ -207,6 +216,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_deletes(
&self,
requests: DeleteRequests,
@@ -218,6 +228,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_row_deletes(
&self,
requests: RowDeleteRequests,

View File

@@ -15,8 +15,9 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use servers::error::AuthSnafu;
use servers::error::{AuthSnafu, Error};
use servers::influxdb::InfluxdbRequest;
use servers::interceptor::{LineProtocolInterceptor, LineProtocolInterceptorRef};
use servers::query_handler::InfluxdbLineProtocolHandler;
use session::context::QueryContextRef;
use snafu::ResultExt;
@@ -36,6 +37,9 @@ impl InfluxdbLineProtocolHandler for Instance {
.check_permission(ctx.current_user(), PermissionReq::LineProtocol)
.context(AuthSnafu)?;
let interceptor_ref = self.plugins.get::<LineProtocolInterceptorRef<Error>>();
interceptor_ref.pre_execute(&request.lines, ctx.clone())?;
let requests = request.try_into()?;
let _ = self
.handle_row_inserts(requests, ctx)

View File

@@ -15,6 +15,7 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use servers::error as server_error;
use servers::error::AuthSnafu;
use servers::opentsdb::codec::DataPoint;
@@ -27,6 +28,7 @@ use crate::instance::Instance;
#[async_trait]
impl OpentsdbProtocolHandler for Instance {
#[tracing::instrument(skip_all, fields(protocol = "opentsdb"))]
async fn exec(
&self,
data_points: Vec<DataPoint>,
@@ -45,8 +47,8 @@ impl OpentsdbProtocolHandler for Instance {
.map_err(BoxedError::new)
.context(servers::error::ExecuteGrpcQuerySnafu)?;
Ok(match output {
common_query::Output::AffectedRows(rows) => rows,
Ok(match output.data {
common_query::OutputData::AffectedRows(rows) => rows,
_ => unreachable!(),
})
}

View File

@@ -15,6 +15,7 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
@@ -22,6 +23,7 @@ use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
use servers::otlp;
use servers::otlp::plugin::TraceParserRef;
use servers::query_handler::OpenTelemetryProtocolHandler;
@@ -33,6 +35,7 @@ use crate::metrics::{OTLP_METRICS_ROWS, OTLP_TRACES_ROWS};
#[async_trait]
impl OpenTelemetryProtocolHandler for Instance {
#[tracing::instrument(skip_all)]
async fn metrics(
&self,
request: ExportMetricsServiceRequest,
@@ -43,6 +46,12 @@ impl OpenTelemetryProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request)?;
let _ = self
.handle_row_inserts(requests, ctx)
@@ -59,6 +68,7 @@ impl OpenTelemetryProtocolHandler for Instance {
Ok(resp)
}
#[tracing::instrument(skip_all)]
async fn traces(
&self,
request: ExportTraceServiceRequest,
@@ -70,6 +80,11 @@ impl OpenTelemetryProtocolHandler for Instance {
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
let (table_name, spans) = match self.plugins.get::<TraceParserRef>() {
Some(parser) => (parser.table_name(), parser.parse(request)),
None => (

View File

@@ -16,19 +16,22 @@ use std::sync::Arc;
use api::prom_store::remote::read_request::ResponseType;
use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse, WriteRequest};
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::OutputData;
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
use common_query::Output;
use common_recordbatch::RecordBatches;
use common_telemetry::logging;
use common_telemetry::{logging, tracing};
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prost::Message;
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
use servers::interceptor::{PromStoreProtocolInterceptor, PromStoreProtocolInterceptorRef};
use servers::prom_store::{self, Metrics};
use servers::query_handler::{
PromStoreProtocolHandler, PromStoreProtocolHandlerRef, PromStoreResponse,
@@ -75,7 +78,7 @@ fn negotiate_response_type(accepted_response_types: &[i32]) -> ServerResult<Resp
}
async fn to_query_result(table_name: &str, output: Output) -> ServerResult<QueryResult> {
let Output::Stream(stream, _) = output else {
let OutputData::Stream(stream) = output.data else {
unreachable!()
};
let recordbatches = RecordBatches::try_collect(stream)
@@ -87,6 +90,7 @@ async fn to_query_result(table_name: &str, output: Output) -> ServerResult<Query
}
impl Instance {
#[tracing::instrument(skip_all)]
async fn handle_remote_query(
&self,
ctx: &QueryContextRef,
@@ -126,6 +130,7 @@ impl Instance {
.context(ExecLogicalPlanSnafu)
}
#[tracing::instrument(skip_all)]
async fn handle_remote_queries(
&self,
ctx: QueryContextRef,
@@ -166,8 +171,12 @@ impl PromStoreProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_write(&request, ctx.clone())?;
let (requests, samples) = prom_store::to_grpc_row_insert_requests(request)?;
let (requests, samples) = prom_store::to_grpc_row_insert_requests(&request)?;
if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
@@ -190,6 +199,38 @@ impl PromStoreProtocolHandler for Instance {
Ok(())
}
async fn write_fast(
&self,
request: RowInsertRequests,
ctx: QueryContextRef,
with_metric_engine: bool,
) -> ServerResult<()> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
.unwrap_or(GREPTIME_PHYSICAL_TABLE)
.to_string();
let _ = self
.handle_metric_row_inserts(request, ctx.clone(), physical_table.to_string())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
} else {
let _ = self
.handle_row_inserts(request, ctx.clone())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
}
Ok(())
}
async fn read(
&self,
request: ReadRequest,
@@ -200,6 +241,10 @@ impl PromStoreProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreRead)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_read(&request, ctx.clone())?;
let response_type = negotiate_response_type(&request.accepted_response_types)?;
@@ -265,7 +310,7 @@ impl PromStoreProtocolHandler for ExportMetricHandler {
ctx: QueryContextRef,
_: bool,
) -> ServerResult<()> {
let (requests, _) = prom_store::to_grpc_row_insert_requests(request)?;
let (requests, _) = prom_store::to_grpc_row_insert_requests(&request)?;
self.inserter
.handle_metric_row_inserts(
requests,
@@ -279,6 +324,15 @@ impl PromStoreProtocolHandler for ExportMetricHandler {
Ok(())
}
async fn write_fast(
&self,
_request: RowInsertRequests,
_ctx: QueryContextRef,
_with_metric_engine: bool,
) -> ServerResult<()> {
unimplemented!()
}
async fn read(
&self,
_request: ReadRequest,

View File

@@ -16,6 +16,8 @@ use std::collections::HashMap;
use async_trait::async_trait;
use common_query::Output;
use servers::error::Error;
use servers::interceptor::{ScriptInterceptor, ScriptInterceptorRef};
use servers::query_handler::ScriptHandler;
use session::context::QueryContextRef;
@@ -30,7 +32,10 @@ impl ScriptHandler for Instance {
name: &str,
script: &str,
) -> servers::error::Result<()> {
let _timer = metrics::METRIC_HANDLE_SCRIPTS_ELAPSED.start_timer();
let interceptor_ref = self.plugins.get::<ScriptInterceptorRef<Error>>();
interceptor_ref.pre_execute(name, query_ctx.clone())?;
let _timer = metrics::INSERT_SCRIPTS_ELAPSED.start_timer();
self.script_executor
.insert_script(query_ctx, name, script)
.await
@@ -42,7 +47,10 @@ impl ScriptHandler for Instance {
name: &str,
params: HashMap<String, String>,
) -> servers::error::Result<Output> {
let _timer = metrics::METRIC_RUN_SCRIPT_ELAPSED.start_timer();
let interceptor_ref = self.plugins.get::<ScriptInterceptorRef<Error>>();
interceptor_ref.pre_execute(name, query_ctx.clone())?;
let _timer = metrics::EXECUTE_SCRIPT_ELAPSED.start_timer();
self.script_executor
.execute_script(query_ctx, name, params)
.await

View File

@@ -16,22 +16,32 @@ use lazy_static::lazy_static;
use prometheus::*;
lazy_static! {
pub static ref METRIC_HANDLE_SQL_ELAPSED: Histogram =
register_histogram!("greptime_frontend_handle_sql_elapsed", "frontend handle sql elapsed").unwrap();
pub static ref METRIC_HANDLE_PROMQL_ELAPSED: Histogram = register_histogram!(
"greptime_frontend_handle_promql_elapsed",
"frontend handle promql elapsed"
/// Timer of handling query in RPC handler.
pub static ref GRPC_HANDLE_QUERY_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_frontend_grpc_handle_query_elapsed",
"Elapsed time of handling queries in RPC handler",
&["type"],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_EXEC_PLAN_ELAPSED: Histogram =
register_histogram!("greptime_frontend_exec_plan_elapsed", "frontend exec plan elapsed").unwrap();
pub static ref METRIC_HANDLE_SCRIPTS_ELAPSED: Histogram = register_histogram!(
"greptime_frontend_handle_scripts_elapsed",
"frontend handle scripts elapsed"
pub static ref GRPC_HANDLE_SQL_ELAPSED: Histogram = GRPC_HANDLE_QUERY_ELAPSED
.with_label_values(&["sql"]);
pub static ref GRPC_HANDLE_PROMQL_ELAPSED: Histogram = GRPC_HANDLE_QUERY_ELAPSED
.with_label_values(&["promql"]);
/// Timer of handling scripts in the script handler.
pub static ref HANDLE_SCRIPT_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_frontend_handle_script_elapsed",
"Elapsed time of handling scripts in the script handler",
&["type"],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_RUN_SCRIPT_ELAPSED: Histogram =
register_histogram!("greptime_frontend_run_script_elapsed", "frontend run script elapsed").unwrap();
pub static ref INSERT_SCRIPTS_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["insert"]);
pub static ref EXECUTE_SCRIPT_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["execute"]);
/// The samples count of Prometheus remote write.
pub static ref PROM_STORE_REMOTE_WRITE_SAMPLES: IntCounter = register_int_counter!(
"greptime_frontend_prometheus_remote_write_samples",

View File

@@ -152,6 +152,10 @@ impl TxnService for RaftEngineBackend {
responses,
})
}
fn max_txn_ops(&self) -> usize {
usize::MAX
}
}
#[async_trait::async_trait]

View File

@@ -24,7 +24,9 @@ fn main() {
#[tokio::main]
async fn run() {
let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"]).await.unwrap();
let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"], 128)
.await
.unwrap();
// put
let put_req = PutRequest {

View File

@@ -193,7 +193,8 @@ pub async fn metasrv_builder(
(None, false) => {
let etcd_client = create_etcd_client(opts).await?;
let kv_backend = {
let etcd_backend = EtcdStore::with_etcd_client(etcd_client.clone());
let etcd_backend =
EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops);
if !opts.store_key_prefix.is_empty() {
Arc::new(ChrootKvBackend::new(
opts.store_key_prefix.clone().into_bytes(),

View File

@@ -79,6 +79,17 @@ pub struct MetaSrvOptions {
pub wal: MetaSrvWalConfig,
pub export_metrics: ExportMetricsOption,
pub store_key_prefix: String,
/// The maximum number of operations permitted in a single txn.
///
/// This value is usually limited by the store backing the `KvBackend`.
/// For example, when using etcd, it should be less than or equal to etcd's
/// `--max-txn-ops` option.
///
/// TODO(jeremy): Currently, this option only affects the etcd store, but it may
/// also affect other stores in the future. In other words, each store needs to
/// limit the number of operations in a txn because an infinitely large txn could
/// potentially block other operations.
pub max_txn_ops: usize,
}
impl MetaSrvOptions {
@@ -112,6 +123,7 @@ impl Default for MetaSrvOptions {
wal: MetaSrvWalConfig::default(),
export_metrics: ExportMetricsOption::default(),
store_key_prefix: String::new(),
max_txn_ops: 128,
}
}
}

View File

@@ -42,7 +42,7 @@ pub async fn mock_with_memstore() -> MockInfo {
}
pub async fn mock_with_etcdstore(addr: &str) -> MockInfo {
let kv_backend = EtcdStore::with_endpoints([addr]).await.unwrap();
let kv_backend = EtcdStore::with_endpoints([addr], 128).await.unwrap();
mock(Default::default(), kv_backend, None, None).await
}

View File

@@ -380,6 +380,10 @@ impl TxnService for LeaderCachedKvBackend {
Ok(res)
}
fn max_txn_ops(&self) -> usize {
self.store.max_txn_ops()
}
}
impl ResettableKvBackend for LeaderCachedKvBackend {

View File

@@ -76,7 +76,9 @@ common-test-util.workspace = true
criterion = "0.4"
log-store.workspace = true
rand.workspace = true
toml.workspace = true
[[bench]]
name = "bench_merge_tree"
name = "memtable_bench"
harness = false
required-features = ["test"]

View File

@@ -7,3 +7,9 @@ The Alfa Romeo [MiTo](https://en.wikipedia.org/wiki/Alfa_Romeo_MiTo) is a front-
> "You can't be a true petrolhead until you've owned an Alfa Romeo."
> <div align="right">-- by Jeremy Clarkson</div>
## Benchmarks
Run benchmarks in this crate:
```bash
cargo bench -p mito2 -F test
```

View File

@@ -0,0 +1,352 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use api::v1::value::ValueData;
use api::v1::{Row, Rows, SemanticType};
use criterion::{criterion_group, criterion_main, Criterion};
use datafusion_common::Column;
use datafusion_expr::{lit, Expr};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::time_series::TimeSeriesMemtable;
use mito2::memtable::{KeyValues, Memtable};
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::Rng;
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::storage::RegionId;
use table::predicate::Predicate;
/// Writes rows.
fn write_rows(c: &mut Criterion) {
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
let timestamps = (0..100).collect::<Vec<_>>();
// Note that this benchmark only generates one time series.
let mut group = c.benchmark_group("write");
group.bench_function("merge_tree", |b| {
let memtable =
MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
}
/// Scans all rows.
fn full_scan(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("full_scan");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
b.iter(|| {
let iter = memtable.iter(None, None).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
b.iter(|| {
let iter = memtable.iter(None, None).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
}
/// Filters 1 host.
fn filter_1_host(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("filter_1_host");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
let predicate = generator.random_host_filter();
b.iter(|| {
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
let predicate = generator.random_host_filter();
b.iter(|| {
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
}
struct Host {
hostname: String,
region: String,
datacenter: String,
rack: String,
os: String,
arch: String,
team: String,
service: String,
service_version: String,
service_environment: String,
}
impl Host {
fn random_with_id(id: usize) -> Host {
let mut rng = rand::thread_rng();
let region = format!("ap-southeast-{}", rng.gen_range(0..10));
let datacenter = format!(
"{}{}",
region,
['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
);
Host {
hostname: format!("host_{id}"),
region,
datacenter,
rack: rng.gen_range(0..100).to_string(),
os: "Ubuntu16.04LTS".to_string(),
arch: "x86".to_string(),
team: "CHI".to_string(),
service: rng.gen_range(0..100).to_string(),
service_version: rng.gen_range(0..10).to_string(),
service_environment: "test".to_string(),
}
}
fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
let tags = [
api::v1::Value {
value_data: Some(ValueData::StringValue(self.hostname.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.region.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.datacenter.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.rack.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.os.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.arch.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.team.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service_version.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service_environment.clone())),
},
];
for tag in tags {
values.push(tag);
}
}
}
struct CpuDataGenerator {
metadata: RegionMetadataRef,
column_schemas: Vec<api::v1::ColumnSchema>,
hosts: Vec<Host>,
start_sec: i64,
end_sec: i64,
}
impl CpuDataGenerator {
fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
let column_schemas = region_metadata_to_row_schema(&metadata);
Self {
metadata,
column_schemas,
hosts: Self::generate_hosts(num_hosts),
start_sec,
end_sec,
}
}
fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
// one point per 10s.
(self.start_sec..self.end_sec)
.step_by(10)
.enumerate()
.map(|(seq, ts)| self.build_key_values(seq, ts))
}
fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
let rows = self
.hosts
.iter()
.map(|host| {
let mut rng = rand::thread_rng();
let mut values = Vec::with_capacity(21);
values.push(api::v1::Value {
value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
});
host.fill_values(&mut values);
for _ in 0..10 {
values.push(api::v1::Value {
value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
});
}
Row { values }
})
.collect();
let mutation = api::v1::Mutation {
op_type: api::v1::OpType::Put as i32,
sequence: seq as u64,
rows: Some(Rows {
schema: self.column_schemas.clone(),
rows,
}),
};
KeyValues::new(&self.metadata, mutation).unwrap()
}
fn random_host_filter(&self) -> Predicate {
let host = self.random_hostname();
let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
Predicate::new(vec![expr.into()])
}
fn random_hostname(&self) -> String {
let mut rng = rand::thread_rng();
self.hosts.choose(&mut rng).unwrap().hostname.clone()
}
fn random_f64(rng: &mut ThreadRng) -> f64 {
let base: u32 = rng.gen_range(30..95);
base as f64
}
fn generate_hosts(num_hosts: usize) -> Vec<Host> {
(0..num_hosts).map(Host::random_with_id).collect()
}
}
/// Creates metadata for a TSBS cpu-like table.
fn cpu_metadata() -> RegionMetadata {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
semantic_type: SemanticType::Timestamp,
column_id: 0,
});
let mut column_id = 1;
let tags = [
"hostname",
"region",
"datacenter",
"rack",
"os",
"arch",
"team",
"service",
"service_version",
"service_environment",
];
for tag in tags {
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
semantic_type: SemanticType::Tag,
column_id,
});
column_id += 1;
}
let fields = [
"usage_user",
"usage_system",
"usage_idle",
"usage_nice",
"usage_iowait",
"usage_irq",
"usage_softirq",
"usage_steal",
"usage_guest",
"usage_guest_nice",
];
for field in fields {
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
semantic_type: SemanticType::Field,
column_id,
});
column_id += 1;
}
builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
builder.build().unwrap()
}
criterion_group!(benches, write_rows, full_scan, filter_1_host);
criterion_main!(benches);

View File

@@ -1,36 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion};
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::Memtable;
use mito2::test_util::memtable_util;
fn bench_merge_tree_memtable(c: &mut Criterion) {
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
let timestamps = (0..100).collect::<Vec<_>>();
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
let _ = c.bench_function("MergeTreeMemtable", |b| {
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
}
criterion_group!(benches, bench_merge_tree_memtable);
criterion_main!(benches);


@@ -158,7 +158,7 @@ impl CacheManager {
}
}
/// Gets the the write cache.
/// Gets the write cache.
pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
self.write_cache.as_ref()
}


@@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize};
use serde_with::{serde_as, NoneAsEmptyString};
use crate::error::Result;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::memtable::MemtableConfig;
use crate::sst::DEFAULT_WRITE_BUFFER_SIZE;
/// Default max running background job.
@@ -104,8 +104,8 @@ pub struct MitoConfig {
/// Inverted index configs.
pub inverted_index: InvertedIndexConfig,
/// Experimental memtable.
pub experimental_memtable: Option<MergeTreeConfig>,
/// Memtable config
pub memtable: MemtableConfig,
}
impl Default for MitoConfig {
@@ -131,7 +131,7 @@ impl Default for MitoConfig {
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
allow_stale_entries: false,
inverted_index: InvertedIndexConfig::default(),
experimental_memtable: None,
memtable: MemtableConfig::default(),
};
// Adjust buffer and cache size according to system memory if we can.
@@ -319,3 +319,25 @@ fn divide_num_cpus(divisor: usize) -> usize {
(cores + divisor - 1) / divisor
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_deserialize_config() {
let s = r#"
[memtable]
type = "experimental"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MitoConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(config) = &config.memtable else {
unreachable!()
};
assert_eq!(1024, config.data_freeze_threshold);
}
}
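For completeness, a hedged sketch of the other variant: with the snake_case tag on MemtableConfig (see the memtable module diff below), the time-series memtable would be selected like this; the test name and placement are illustrative only:
#[test]
fn test_deserialize_time_series_memtable_config() {
    let s = r#"
[memtable]
type = "time_series"
"#;
    let config: MitoConfig = toml::from_str(s).unwrap();
    assert_eq!(MemtableConfig::TimeSeries, config.memtable);
}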


@@ -47,6 +47,7 @@ mod truncate_test;
use std::any::Any;
use std::sync::Arc;
use std::time::Instant;
use async_trait::async_trait;
use common_error::ext::BoxedError;
@@ -219,6 +220,7 @@ impl EngineInner {
/// Handles the scan `request` and returns a [Scanner] for the `request`.
fn handle_query(&self, region_id: RegionId, request: ScanRequest) -> Result<Scanner> {
let query_start = Instant::now();
// Reading a region doesn't need to go through the region worker thread.
let region = self
.workers
@@ -239,7 +241,8 @@ impl EngineInner {
Some(cache_manager),
)
.with_parallelism(scan_parallelism)
.ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled());
.with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled())
.with_start_time(query_start);
scan_region.scanner()
}


@@ -14,16 +14,12 @@
//! Memtables are write buffers for regions.
pub mod key_values;
pub mod merge_tree;
pub mod time_series;
pub(crate) mod version;
use std::fmt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use common_time::Timestamp;
use serde::{Deserialize, Serialize};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::ColumnId;
use table::predicate::Predicate;
@@ -31,14 +27,34 @@ use table::predicate::Predicate;
use crate::error::Result;
use crate::flush::WriteBufferManagerRef;
pub use crate::memtable::key_values::KeyValues;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::Batch;
pub mod key_values;
pub mod merge_tree;
pub mod time_series;
pub(crate) mod version;
/// Id for memtables.
///
/// Should be unique under the same region.
pub type MemtableId = u32;
/// Config for memtables.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MemtableConfig {
Experimental(MergeTreeConfig),
TimeSeries,
}
impl Default for MemtableConfig {
fn default() -> Self {
Self::Experimental(MergeTreeConfig::default())
}
}
#[derive(Debug, Default)]
pub struct MemtableStats {
/// The estimated bytes allocated by this memtable from heap.
@@ -187,9 +203,30 @@ impl Drop for AllocTracker {
#[cfg(test)]
mod tests {
use common_base::readable_size::ReadableSize;
use super::*;
use crate::flush::{WriteBufferManager, WriteBufferManagerImpl};
#[test]
fn test_deserialize_memtable_config() {
let s = r#"
type = "experimental"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MemtableConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(merge_tree) = config else {
unreachable!()
};
assert!(merge_tree.dedup);
assert_eq!(8192, merge_tree.index_max_keys_per_shard);
assert_eq!(1024, merge_tree.data_freeze_threshold);
assert_eq!(ReadableSize::mb(512), merge_tree.fork_dictionary_bytes);
}
#[test]
fn test_alloc_tracker_without_manager() {
let tracker = AllocTracker::new(None);


@@ -44,7 +44,7 @@ use crate::memtable::{
};
/// Use `1/DICTIONARY_SIZE_FACTOR` of OS memory as dictionary size.
const DICTIONARY_SIZE_FACTOR: u64 = 16;
const DICTIONARY_SIZE_FACTOR: u64 = 8;
/// Id of a shard, only unique inside a partition.
type ShardId = u32;
@@ -74,7 +74,7 @@ pub struct MergeTreeConfig {
impl Default for MergeTreeConfig {
fn default() -> Self {
let mut fork_dictionary_bytes = ReadableSize::mb(512);
let mut fork_dictionary_bytes = ReadableSize::gb(1);
if let Some(sys_memory) = common_config::utils::get_sys_total_memory() {
let adjust_dictionary_bytes =
std::cmp::min(sys_memory / DICTIONARY_SIZE_FACTOR, fork_dictionary_bytes);
@@ -85,7 +85,7 @@ impl Default for MergeTreeConfig {
Self {
index_max_keys_per_shard: 8192,
data_freeze_threshold: 102400,
data_freeze_threshold: 131072,
dedup: true,
fork_dictionary_bytes,
}
@@ -293,6 +293,8 @@ mod tests {
use std::collections::BTreeSet;
use common_time::Timestamp;
use datafusion_common::{Column, ScalarValue};
use datafusion_expr::{BinaryExpr, Expr, Operator};
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
@@ -528,4 +530,55 @@ mod tests {
.collect::<Vec<_>>();
assert_eq!(expect, read);
}
#[test]
fn test_memtable_filter() {
let metadata = memtable_util::metadata_with_primary_key(vec![0, 1], false);
// Try to build a memtable via the builder.
let memtable = MergeTreeMemtableBuilder::new(
MergeTreeConfig {
index_max_keys_per_shard: 40,
..Default::default()
},
None,
)
.build(1, &metadata);
for i in 0..100 {
let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), i, &timestamps, 1);
memtable.write(&kvs).unwrap();
}
for i in 0..100 {
let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
let expr = Expr::BinaryExpr(BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "k1".to_string(),
})),
op: Operator::Eq,
right: Box::new(Expr::Literal(ScalarValue::UInt32(Some(i)))),
});
let iter = memtable
.iter(None, Some(Predicate::new(vec![expr.into()])))
.unwrap();
let read = iter
.flat_map(|batch| {
batch
.unwrap()
.timestamps()
.as_any()
.downcast_ref::<TimestampMillisecondVector>()
.unwrap()
.iter_data()
.collect::<Vec<_>>()
.into_iter()
})
.map(|v| v.unwrap().0.value())
.collect::<Vec<_>>();
assert_eq!(timestamps, read);
}
}
}
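A note on the test above: index_max_keys_per_shard is lowered to 40 while 100 distinct primary keys are written, presumably so the keys spill across several shards and the k1 = i filter exercises primary-key pruning in both the shard readers and the shard builder reader introduced in this change.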


@@ -957,6 +957,18 @@ impl DataParts {
self.active.write_row(pk_index, kv)
}
/// Returns the number of rows in the active buffer.
pub fn num_active_rows(&self) -> usize {
self.active.num_rows()
}
/// Freezes the active buffer and creates a new one.
pub fn freeze(&mut self) -> Result<()> {
let part = self.active.freeze(None, false)?;
self.frozen.push(part);
Ok(())
}
/// Reads data from all parts including active and frozen parts.
/// The returned iterator yields a record batch of one primary key at a time.
/// The order of yielding primary keys is determined by provided weights.
@@ -976,6 +988,11 @@ impl DataParts {
pub(crate) fn is_empty(&self) -> bool {
self.active.is_empty() && self.frozen.iter().all(|part| part.is_empty())
}
#[cfg(test)]
pub(crate) fn frozen_len(&self) -> usize {
self.frozen.len()
}
}
pub struct DataPartsReaderBuilder {
@@ -994,9 +1011,11 @@ impl DataPartsReaderBuilder {
for p in self.parts {
nodes.push(DataNode::new(DataSource::Part(p)));
}
let num_parts = nodes.len();
let merger = Merger::try_new(nodes)?;
Ok(DataPartsReader {
merger,
num_parts,
elapsed: Default::default(),
})
}
@@ -1005,6 +1024,7 @@ impl DataPartsReaderBuilder {
/// Reader for all parts inside a `DataParts`.
pub struct DataPartsReader {
merger: Merger<DataNode>,
num_parts: usize,
elapsed: Duration,
}
@@ -1032,6 +1052,10 @@ impl DataPartsReader {
pub(crate) fn is_valid(&self) -> bool {
self.merger.is_valid()
}
pub(crate) fn num_parts(&self) -> usize {
self.num_parts
}
}
#[cfg(test)]

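These DataParts additions (num_active_rows, freeze, frozen_len, and the part count surfaced through DataPartsReader::num_parts) back the per-shard data_freeze_threshold and the reader metrics introduced in shard.rs further down.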

@@ -45,7 +45,7 @@ impl<T: DataBatchSource> DataBatchSource for DedupReader<T> {
}
fn next(&mut self) -> Result<()> {
loop {
while self.inner.is_valid() {
match &mut self.prev_batch_last_row {
None => {
// First shot, fill prev_batch_last_row and current_batch_range with first batch.


@@ -78,7 +78,7 @@ impl Partition {
// Finds key in shards, now we ensure one key only exists in one shard.
if let Some(pk_id) = inner.find_key_in_shards(primary_key) {
inner.write_to_shard(pk_id, &key_value);
inner.write_to_shard(pk_id, &key_value)?;
inner.num_rows += 1;
return Ok(());
}
@@ -106,7 +106,7 @@ impl Partition {
}
/// Writes to the partition without a primary key.
pub fn write_no_key(&self, key_value: KeyValue) {
pub fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
let mut inner = self.inner.write().unwrap();
// If no primary key, always write to the first shard.
debug_assert!(!inner.shards.is_empty());
@@ -117,12 +117,24 @@ impl Partition {
shard_id: 0,
pk_index: 0,
};
inner.shards[0].write_with_pk_id(pk_id, &key_value);
inner.shards[0].write_with_pk_id(pk_id, &key_value)?;
inner.num_rows += 1;
Ok(())
}
/// Scans data in the partition.
pub fn read(&self, mut context: ReadPartitionContext) -> Result<PartitionReader> {
let start = Instant::now();
let key_filter = if context.need_prune_key {
Some(PrimaryKeyFilter::new(
context.metadata.clone(),
context.filters.clone(),
context.row_codec.clone(),
))
} else {
None
};
let (builder_source, shard_reader_builders) = {
let inner = self.inner.read().unwrap();
let mut shard_source = Vec::with_capacity(inner.shards.len() + 1);
@@ -141,14 +153,21 @@ impl Partition {
(builder_reader, shard_source)
};
context.metrics.num_shards += shard_reader_builders.len();
let mut nodes = shard_reader_builders
.into_iter()
.map(|builder| Ok(ShardNode::new(ShardSource::Shard(builder.build()?))))
.map(|builder| {
Ok(ShardNode::new(ShardSource::Shard(
builder.build(key_filter.clone())?,
)))
})
.collect::<Result<Vec<_>>>()?;
if let Some(builder) = builder_source {
context.metrics.num_builder += 1;
// Move the initialization of ShardBuilderReader out of read lock.
let shard_builder_reader = builder.build(Some(&context.pk_weights))?;
let shard_builder_reader =
builder.build(Some(&context.pk_weights), key_filter.clone())?;
nodes.push(ShardNode::new(ShardSource::Builder(shard_builder_reader)));
}
@@ -156,8 +175,10 @@ impl Partition {
let merger = ShardMerger::try_new(nodes)?;
if self.dedup {
let source = DedupReader::try_new(merger)?;
context.metrics.build_partition_reader += start.elapsed();
PartitionReader::new(context, Box::new(source))
} else {
context.metrics.build_partition_reader += start.elapsed();
PartitionReader::new(context, Box::new(merger))
}
}
@@ -266,11 +287,11 @@ pub(crate) struct PartitionStats {
#[derive(Default)]
struct PartitionReaderMetrics {
prune_pk: Duration,
build_partition_reader: Duration,
read_source: Duration,
data_batch_to_batch: Duration,
keys_before_pruning: usize,
keys_after_pruning: usize,
num_builder: usize,
num_shards: usize,
}
/// Reader to scan rows in a partition.
@@ -279,18 +300,11 @@ struct PartitionReaderMetrics {
pub struct PartitionReader {
context: ReadPartitionContext,
source: BoxedDataBatchSource,
last_yield_pk_id: Option<PkId>,
}
impl PartitionReader {
fn new(context: ReadPartitionContext, source: BoxedDataBatchSource) -> Result<Self> {
let mut reader = Self {
context,
source,
last_yield_pk_id: None,
};
// Find next valid batch.
reader.prune_batch_by_key()?;
let reader = Self { context, source };
Ok(reader)
}
@@ -305,8 +319,7 @@ impl PartitionReader {
/// # Panics
/// Panics if the reader is invalid.
pub fn next(&mut self) -> Result<()> {
self.advance_source()?;
self.prune_batch_by_key()
self.advance_source()
}
/// Converts current data batch into a [Batch].
@@ -336,106 +349,77 @@ impl PartitionReader {
self.context.metrics.read_source += read_source.elapsed();
Ok(())
}
fn prune_batch_by_key(&mut self) -> Result<()> {
if self.context.metadata.primary_key.is_empty() || !self.context.need_prune_key {
// Nothing to prune.
return Ok(());
}
while self.source.is_valid() {
let pk_id = self.source.current_pk_id();
if let Some(yield_pk_id) = self.last_yield_pk_id {
if pk_id == yield_pk_id {
// If this batch has the same key as last returned batch.
// We can return it without evaluating filters.
break;
}
}
let key = self.source.current_key().unwrap();
self.context.metrics.keys_before_pruning += 1;
// Prune batch by primary key.
if prune_primary_key(
&self.context.metadata,
&self.context.filters,
&self.context.row_codec,
key,
&mut self.context.metrics,
) {
// We need this key.
self.last_yield_pk_id = Some(pk_id);
self.context.metrics.keys_after_pruning += 1;
break;
}
self.advance_source()?;
}
Ok(())
}
}
fn prune_primary_key(
metadata: &RegionMetadataRef,
filters: &[SimpleFilterEvaluator],
codec: &McmpRowCodec,
pk: &[u8],
metrics: &mut PartitionReaderMetrics,
) -> bool {
let start = Instant::now();
let res = prune_primary_key_inner(metadata, filters, codec, pk);
metrics.prune_pk += start.elapsed();
res
#[derive(Clone)]
pub(crate) struct PrimaryKeyFilter {
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: Arc<McmpRowCodec>,
offsets_buf: Vec<usize>,
}
// TODO(yingwen): Improve performance of key pruning. Now we need to find index and
// then decode and convert each value.
/// Returns true if the `pk` is still needed.
fn prune_primary_key_inner(
metadata: &RegionMetadataRef,
filters: &[SimpleFilterEvaluator],
codec: &McmpRowCodec,
pk: &[u8],
) -> bool {
if filters.is_empty() {
return true;
impl PrimaryKeyFilter {
pub(crate) fn new(
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: Arc<McmpRowCodec>,
) -> Self {
Self {
metadata,
filters,
codec,
offsets_buf: Vec::new(),
}
}
// no primary key, we simply return true.
if metadata.primary_key.is_empty() {
return true;
}
let pk_values = match codec.decode(pk) {
Ok(values) => values,
Err(e) => {
common_telemetry::error!(e; "Failed to decode primary key");
pub(crate) fn prune_primary_key(&mut self, pk: &[u8]) -> bool {
if self.filters.is_empty() {
return true;
}
};
// evaluate filters against primary key values
let mut result = true;
for filter in filters {
if Partition::is_partition_column(filter.column_name()) {
continue;
// no primary key, we simply return true.
if self.metadata.primary_key.is_empty() {
return true;
}
let Some(column) = metadata.column_by_name(filter.column_name()) else {
continue;
};
// ignore filters that are not referencing primary key columns
if column.semantic_type != SemanticType::Tag {
continue;
// evaluate filters against primary key values
let mut result = true;
self.offsets_buf.clear();
for filter in &*self.filters {
if Partition::is_partition_column(filter.column_name()) {
continue;
}
let Some(column) = self.metadata.column_by_name(filter.column_name()) else {
continue;
};
// ignore filters that are not referencing primary key columns
if column.semantic_type != SemanticType::Tag {
continue;
}
// index of the column in primary keys.
// Safety: A tag column is always in primary key.
let index = self.metadata.primary_key_index(column.column_id).unwrap();
let value = match self.codec.decode_value_at(pk, index, &mut self.offsets_buf) {
Ok(v) => v,
Err(e) => {
common_telemetry::error!(e; "Failed to decode primary key");
return true;
}
};
// TODO(yingwen): `evaluate_scalar()` creates temporary arrays to compare scalars. We
// can compare the bytes directly without allocation and matching types as we use
// comparable encoding.
// Safety: arrow schema and datatypes are constructed from the same source.
let scalar_value = value
.try_to_scalar_value(&column.column_schema.data_type)
.unwrap();
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
}
// index of the column in primary keys.
// Safety: A tag column is always in primary key.
let index = metadata.primary_key_index(column.column_id).unwrap();
// Safety: arrow schema and datatypes are constructed from the same source.
let scalar_value = pk_values[index]
.try_to_scalar_value(&column.column_schema.data_type)
.unwrap();
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
result
}
result
}
/// Structs to reuse across readers to avoid allocating for each reader.
@@ -443,7 +427,7 @@ pub(crate) struct ReadPartitionContext {
metadata: RegionMetadataRef,
row_codec: Arc<McmpRowCodec>,
projection: HashSet<ColumnId>,
filters: Vec<SimpleFilterEvaluator>,
filters: Arc<Vec<SimpleFilterEvaluator>>,
/// Buffer to store pk weights.
pk_weights: Vec<u16>,
need_prune_key: bool,
@@ -452,10 +436,6 @@ pub(crate) struct ReadPartitionContext {
impl Drop for ReadPartitionContext {
fn drop(&mut self) {
let partition_prune_pk = self.metrics.prune_pk.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_prune_pk"])
.observe(partition_prune_pk);
let partition_read_source = self.metrics.read_source.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_read_source"])
@@ -465,16 +445,19 @@ impl Drop for ReadPartitionContext {
.with_label_values(&["partition_data_batch_to_batch"])
.observe(partition_data_batch_to_batch);
if self.metrics.keys_before_pruning != 0 {
common_telemetry::debug!(
"TreeIter pruning, before: {}, after: {}, partition_read_source: {}s, partition_prune_pk: {}s, partition_data_batch_to_batch: {}s",
self.metrics.keys_before_pruning,
self.metrics.keys_after_pruning,
partition_read_source,
partition_prune_pk,
partition_data_batch_to_batch,
);
}
common_telemetry::debug!(
"TreeIter partitions metrics, \
num_builder: {}, \
num_shards: {}, \
build_partition_reader: {}s, \
partition_read_source: {}s, \
partition_data_batch_to_batch: {}s",
self.metrics.num_builder,
self.metrics.num_shards,
self.metrics.build_partition_reader.as_secs_f64(),
partition_read_source,
partition_data_batch_to_batch,
);
}
}
@@ -490,7 +473,7 @@ impl ReadPartitionContext {
metadata,
row_codec,
projection,
filters,
filters: Arc::new(filters),
pk_weights: Vec::new(),
need_prune_key,
metrics: Default::default(),
@@ -578,7 +561,16 @@ impl Inner {
fn new(metadata: RegionMetadataRef, config: &MergeTreeConfig) -> Self {
let (shards, current_shard_id) = if metadata.primary_key.is_empty() {
let data_parts = DataParts::new(metadata.clone(), DATA_INIT_CAP, config.dedup);
(vec![Shard::new(0, None, data_parts, config.dedup)], 1)
(
vec![Shard::new(
0,
None,
data_parts,
config.dedup,
config.data_freeze_threshold,
)],
1,
)
} else {
(Vec::new(), 0)
};
@@ -598,18 +590,22 @@ impl Inner {
self.pk_to_pk_id.get(primary_key).copied()
}
fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) {
fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
if pk_id.shard_id == self.shard_builder.current_shard_id() {
self.shard_builder.write_with_pk_id(pk_id, key_value);
return;
}
for shard in &mut self.shards {
if shard.shard_id == pk_id.shard_id {
shard.write_with_pk_id(pk_id, key_value);
self.num_rows += 1;
return;
}
return Ok(());
}
// Safety: We find the shard by shard id.
let shard = self
.shards
.iter_mut()
.find(|shard| shard.shard_id == pk_id.shard_id)
.unwrap();
shard.write_with_pk_id(pk_id, key_value)?;
self.num_rows += 1;
Ok(())
}
fn freeze_active_shard(&mut self) -> Result<()> {

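Taken together, this file moves primary-key pruning out of PartitionReader and into the per-shard readers: Partition::read builds a cloneable PrimaryKeyFilter from the shared filters and row codec and hands it to each shard and shard-builder reader, and prune_primary_key decodes only the tag columns referenced by the filters via decode_value_at (reusing offsets_buf) instead of decoding the whole key up front.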

@@ -15,6 +15,7 @@
//! Shard in a partition.
use std::cmp::Ordering;
use std::time::{Duration, Instant};
use store_api::metadata::RegionMetadataRef;
@@ -25,8 +26,10 @@ use crate::memtable::merge_tree::data::{
};
use crate::memtable::merge_tree::dict::KeyDictRef;
use crate::memtable::merge_tree::merger::{Merger, Node};
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard_builder::ShardBuilderReader;
use crate::memtable::merge_tree::{PkId, ShardId};
use crate::memtable::merge_tree::{PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
/// Shard stores data related to the same key dictionary.
pub struct Shard {
@@ -36,6 +39,8 @@ pub struct Shard {
/// Data in the shard.
data_parts: DataParts,
dedup: bool,
/// Number of rows to freeze a data part.
data_freeze_threshold: usize,
}
impl Shard {
@@ -45,20 +50,29 @@ impl Shard {
key_dict: Option<KeyDictRef>,
data_parts: DataParts,
dedup: bool,
data_freeze_threshold: usize,
) -> Shard {
Shard {
shard_id,
key_dict,
data_parts,
dedup,
data_freeze_threshold,
}
}
/// Writes a key value into the shard.
pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) {
///
/// It freezes the active buffer if it is full.
pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
debug_assert_eq!(self.shard_id, pk_id.shard_id);
if self.data_parts.num_active_rows() >= self.data_freeze_threshold {
self.data_parts.freeze()?;
}
self.data_parts.write_row(pk_id.pk_index, key_value);
Ok(())
}
/// Scans the shard.
@@ -80,6 +94,7 @@ impl Shard {
key_dict: self.key_dict.clone(),
data_parts: DataParts::new(metadata, DATA_INIT_CAP, self.dedup),
dedup: self.dedup,
data_freeze_threshold: self.data_freeze_threshold,
}
}
@@ -131,18 +146,15 @@ pub struct ShardReaderBuilder {
}
impl ShardReaderBuilder {
pub(crate) fn build(self) -> Result<ShardReader> {
pub(crate) fn build(self, key_filter: Option<PrimaryKeyFilter>) -> Result<ShardReader> {
let ShardReaderBuilder {
shard_id,
key_dict,
inner,
} = self;
let now = Instant::now();
let parts_reader = inner.build()?;
Ok(ShardReader {
shard_id,
key_dict,
parts_reader,
})
ShardReader::new(shard_id, key_dict, parts_reader, key_filter, now.elapsed())
}
}
@@ -151,15 +163,46 @@ pub struct ShardReader {
shard_id: ShardId,
key_dict: Option<KeyDictRef>,
parts_reader: DataPartsReader,
key_filter: Option<PrimaryKeyFilter>,
last_yield_pk_index: Option<PkIndex>,
keys_before_pruning: usize,
keys_after_pruning: usize,
prune_pk_cost: Duration,
data_build_cost: Duration,
}
impl ShardReader {
fn new(
shard_id: ShardId,
key_dict: Option<KeyDictRef>,
parts_reader: DataPartsReader,
key_filter: Option<PrimaryKeyFilter>,
data_build_cost: Duration,
) -> Result<Self> {
let has_pk = key_dict.is_some();
let mut reader = Self {
shard_id,
key_dict,
parts_reader,
key_filter: if has_pk { key_filter } else { None },
last_yield_pk_index: None,
keys_before_pruning: 0,
keys_after_pruning: 0,
prune_pk_cost: Duration::default(),
data_build_cost,
};
reader.prune_batch_by_key()?;
Ok(reader)
}
fn is_valid(&self) -> bool {
self.parts_reader.is_valid()
}
fn next(&mut self) -> Result<()> {
self.parts_reader.next()
self.parts_reader.next()?;
self.prune_batch_by_key()
}
fn current_key(&self) -> Option<&[u8]> {
@@ -180,6 +223,54 @@ impl ShardReader {
fn current_data_batch(&self) -> DataBatch {
self.parts_reader.current_data_batch()
}
fn prune_batch_by_key(&mut self) -> Result<()> {
let Some(key_filter) = &mut self.key_filter else {
return Ok(());
};
while self.parts_reader.is_valid() {
let pk_index = self.parts_reader.current_data_batch().pk_index();
if let Some(yield_pk_index) = self.last_yield_pk_index {
if pk_index == yield_pk_index {
break;
}
}
self.keys_before_pruning += 1;
// Safety: `key_filter` is some so the shard has primary keys.
let key = self.key_dict.as_ref().unwrap().key_by_pk_index(pk_index);
let now = Instant::now();
if key_filter.prune_primary_key(key) {
self.prune_pk_cost += now.elapsed();
self.last_yield_pk_index = Some(pk_index);
self.keys_after_pruning += 1;
break;
}
self.prune_pk_cost += now.elapsed();
self.parts_reader.next()?;
}
Ok(())
}
}
impl Drop for ShardReader {
fn drop(&mut self) {
let shard_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_prune_pk"])
.observe(shard_prune_pk);
if self.keys_before_pruning > 0 {
common_telemetry::debug!(
"ShardReader metrics, data parts: {}, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
self.parts_reader.num_parts(),
self.keys_before_pruning,
self.keys_after_pruning,
shard_prune_pk,
self.data_build_cost.as_secs_f64(),
);
}
}
}
/// A merger that merges batches from multiple shards.
@@ -388,6 +479,7 @@ mod tests {
shard_id: ShardId,
metadata: RegionMetadataRef,
input: &[(KeyValues, PkIndex)],
data_freeze_threshold: usize,
) -> Shard {
let mut dict_builder = KeyDictBuilder::new(1024);
let mut metrics = WriteMetrics::default();
@@ -402,27 +494,17 @@ mod tests {
let dict = dict_builder.finish(&mut BTreeMap::new()).unwrap();
let data_parts = DataParts::new(metadata, DATA_INIT_CAP, true);
Shard::new(shard_id, Some(Arc::new(dict)), data_parts, true)
Shard::new(
shard_id,
Some(Arc::new(dict)),
data_parts,
true,
data_freeze_threshold,
)
}
#[test]
fn test_write_read_shard() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let mut shard = new_shard_with_dict(8, metadata, &input);
assert!(shard.is_empty());
for (key_values, pk_index) in &input {
for kv in key_values.iter() {
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *pk_index,
};
shard.write_with_pk_id(pk_id, &kv);
}
}
assert!(!shard.is_empty());
let mut reader = shard.read().unwrap().build().unwrap();
fn collect_timestamps(shard: &Shard) -> Vec<i64> {
let mut reader = shard.read().unwrap().build(None).unwrap();
let mut timestamps = Vec::new();
while reader.is_valid() {
let rb = reader.current_data_batch().slice_record_batch();
@@ -432,6 +514,64 @@ mod tests {
reader.next().unwrap();
}
timestamps
}
#[test]
fn test_write_read_shard() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let mut shard = new_shard_with_dict(8, metadata, &input, 100);
assert!(shard.is_empty());
for (key_values, pk_index) in &input {
for kv in key_values.iter() {
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *pk_index,
};
shard.write_with_pk_id(pk_id, &kv).unwrap();
}
}
assert!(!shard.is_empty());
let timestamps = collect_timestamps(&shard);
assert_eq!(vec![0, 1, 10, 11, 20, 21], timestamps);
}
#[test]
fn test_shard_freeze() {
let metadata = metadata_for_test();
let kvs = build_key_values_with_ts_seq_values(
&metadata,
"shard".to_string(),
0,
[0].into_iter(),
[Some(0.0)].into_iter(),
0,
);
let mut shard = new_shard_with_dict(8, metadata.clone(), &[(kvs, 0)], 50);
let expected: Vec<_> = (0..200).collect();
for i in &expected {
let kvs = build_key_values_with_ts_seq_values(
&metadata,
"shard".to_string(),
0,
[*i].into_iter(),
[Some(0.0)].into_iter(),
*i as u64,
);
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *i as PkIndex,
};
for kv in kvs.iter() {
shard.write_with_pk_id(pk_id, &kv).unwrap();
}
}
assert!(!shard.is_empty());
assert_eq!(3, shard.data_parts.frozen_len());
let timestamps = collect_timestamps(&shard);
assert_eq!(expected, timestamps);
}
}
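In test_shard_freeze, the freeze threshold is 50 and 200 single-row key values are written; write_with_pk_id freezes the active buffer whenever it already holds 50 rows, so freezes happen before the 51st, 101st, and 151st writes. That leaves three frozen parts and 50 rows in the active buffer, matching the frozen_len() assertion.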


@@ -16,6 +16,7 @@
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::{Duration, Instant};
use store_api::metadata::RegionMetadataRef;
@@ -26,8 +27,9 @@ use crate::memtable::merge_tree::data::{
};
use crate::memtable::merge_tree::dict::{DictBuilderReader, KeyDictBuilder};
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard::Shard;
use crate::memtable::merge_tree::{MergeTreeConfig, PkId, ShardId};
use crate::memtable::merge_tree::{MergeTreeConfig, PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
/// Builder to write keys and data to a shard that the key dictionary
@@ -136,7 +138,13 @@ impl ShardBuilder {
let shard_id = self.current_shard_id;
self.current_shard_id += 1;
Ok(Some(Shard::new(shard_id, key_dict, data_parts, self.dedup)))
Ok(Some(Shard::new(
shard_id,
key_dict,
data_parts,
self.dedup,
self.data_freeze_threshold,
)))
}
/// Scans the shard builder.
@@ -176,13 +184,20 @@ pub(crate) struct ShardBuilderReaderBuilder {
}
impl ShardBuilderReaderBuilder {
pub(crate) fn build(self, pk_weights: Option<&[u16]>) -> Result<ShardBuilderReader> {
pub(crate) fn build(
self,
pk_weights: Option<&[u16]>,
key_filter: Option<PrimaryKeyFilter>,
) -> Result<ShardBuilderReader> {
let now = Instant::now();
let data_reader = self.data_reader.build(pk_weights)?;
Ok(ShardBuilderReader {
shard_id: self.shard_id,
dict_reader: self.dict_reader,
ShardBuilderReader::new(
self.shard_id,
self.dict_reader,
data_reader,
})
key_filter,
now.elapsed(),
)
}
}
@@ -191,15 +206,45 @@ pub struct ShardBuilderReader {
shard_id: ShardId,
dict_reader: DictBuilderReader,
data_reader: DataBufferReader,
key_filter: Option<PrimaryKeyFilter>,
last_yield_pk_index: Option<PkIndex>,
keys_before_pruning: usize,
keys_after_pruning: usize,
prune_pk_cost: Duration,
data_build_cost: Duration,
}
impl ShardBuilderReader {
fn new(
shard_id: ShardId,
dict_reader: DictBuilderReader,
data_reader: DataBufferReader,
key_filter: Option<PrimaryKeyFilter>,
data_build_cost: Duration,
) -> Result<Self> {
let mut reader = ShardBuilderReader {
shard_id,
dict_reader,
data_reader,
key_filter,
last_yield_pk_index: None,
keys_before_pruning: 0,
keys_after_pruning: 0,
prune_pk_cost: Duration::default(),
data_build_cost,
};
reader.prune_batch_by_key()?;
Ok(reader)
}
pub fn is_valid(&self) -> bool {
self.data_reader.is_valid()
}
pub fn next(&mut self) -> Result<()> {
self.data_reader.next()
self.data_reader.next()?;
self.prune_batch_by_key()
}
pub fn current_key(&self) -> Option<&[u8]> {
@@ -218,6 +263,52 @@ impl ShardBuilderReader {
pub fn current_data_batch(&self) -> DataBatch {
self.data_reader.current_data_batch()
}
fn prune_batch_by_key(&mut self) -> Result<()> {
let Some(key_filter) = &mut self.key_filter else {
return Ok(());
};
while self.data_reader.is_valid() {
let pk_index = self.data_reader.current_data_batch().pk_index();
if let Some(yield_pk_index) = self.last_yield_pk_index {
if pk_index == yield_pk_index {
break;
}
}
self.keys_before_pruning += 1;
let key = self.dict_reader.key_by_pk_index(pk_index);
let now = Instant::now();
if key_filter.prune_primary_key(key) {
self.prune_pk_cost += now.elapsed();
self.last_yield_pk_index = Some(pk_index);
self.keys_after_pruning += 1;
break;
}
self.prune_pk_cost += now.elapsed();
self.data_reader.next()?;
}
Ok(())
}
}
impl Drop for ShardBuilderReader {
fn drop(&mut self) {
let shard_builder_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_builder_prune_pk"])
.observe(shard_builder_prune_pk);
if self.keys_before_pruning > 0 {
common_telemetry::debug!(
"ShardBuilderReader metrics, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
self.keys_before_pruning,
self.keys_after_pruning,
shard_builder_prune_pk,
self.data_build_cost.as_secs_f64(),
);
}
}
}
#[cfg(test)]
@@ -306,7 +397,7 @@ mod tests {
let mut reader = shard_builder
.read(&mut pk_weights)
.unwrap()
.build(Some(&pk_weights))
.build(Some(&pk_weights), None)
.unwrap();
let mut timestamps = Vec::new();
while reader.is_valid() {


@@ -39,7 +39,7 @@ use crate::memtable::merge_tree::partition::{
};
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::memtable::{BoxedBatchIterator, KeyValues};
use crate::metrics::{MERGE_TREE_READ_STAGE_ELAPSED, READ_STAGE_ELAPSED};
use crate::metrics::{MERGE_TREE_READ_STAGE_ELAPSED, READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
use crate::read::Batch;
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
@@ -124,7 +124,7 @@ impl MergeTree {
if !has_pk {
// No primary key.
self.write_no_key(kv);
self.write_no_key(kv)?;
continue;
}
@@ -299,7 +299,7 @@ impl MergeTree {
)
}
fn write_no_key(&self, key_value: KeyValue) {
fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
let partition_key = Partition::get_partition_key(&key_value, self.is_partitioned);
let partition = self.get_or_create_partition(partition_key);
@@ -397,6 +397,9 @@ struct TreeIter {
impl Drop for TreeIter {
fn drop(&mut self) {
READ_ROWS_TOTAL
.with_label_values(&["merge_tree_memtable"])
.inc_by(self.metrics.rows_fetched as u64);
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["fetch_next_partition"])
.observe(self.metrics.fetch_partition_elapsed.as_secs_f64());


@@ -123,7 +123,7 @@ lazy_static! {
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
/// Counter of rows read.
/// Counter of rows read from different sources.
pub static ref READ_ROWS_TOTAL: IntCounterVec =
register_int_counter_vec!("greptime_mito_read_rows_total", "mito read rows total", &[TYPE_LABEL]).unwrap();
/// Counter of filtered rows during merge.
@@ -137,6 +137,24 @@ lazy_static! {
register_int_counter_vec!("greptime_mito_precise_filter_rows_total", "mito precise filter rows total", &[TYPE_LABEL]).unwrap();
pub static ref READ_ROWS_IN_ROW_GROUP_TOTAL: IntCounterVec =
register_int_counter_vec!("greptime_mito_read_rows_in_row_group_total", "mito read rows in row group total", &[TYPE_LABEL]).unwrap();
/// Histogram for the number of SSTs to scan per query.
pub static ref READ_SST_COUNT: Histogram = register_histogram!(
"greptime_mito_read_sst_count",
"Number of SSTs to scan in a scan task",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 256.0, 1024.0],
).unwrap();
/// Histogram for the number of rows returned per query.
pub static ref READ_ROWS_RETURN: Histogram = register_histogram!(
"greptime_mito_read_rows_return",
"Number of rows returned in a scan task",
exponential_buckets(100.0, 10.0, 8).unwrap(),
).unwrap();
/// Histogram for the number of batches returned per query.
pub static ref READ_BATCHES_RETURN: Histogram = register_histogram!(
"greptime_mito_read_batches_return",
"Number of rows returned in a scan task",
exponential_buckets(100.0, 10.0, 7).unwrap(),
).unwrap();
// ------- End of query metrics.
// Cache related metrics.

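For reference, a small standalone sketch (not part of the diff) of the bucket layout the new histograms get from the prometheus crate's exponential_buckets(start, factor, count) helper, which yields count buckets, each factor times the previous:
use prometheus::exponential_buckets;

fn main() {
    // Buckets used by READ_ROWS_RETURN: 100, 1e3, ..., 1e9 (8 buckets);
    // READ_BATCHES_RETURN uses the same layout with 7 buckets (100 .. 1e8).
    let buckets = exponential_buckets(100.0, 10.0, 8).unwrap();
    assert_eq!(8, buckets.len());
    assert_eq!(100.0, buckets[0]);
    assert_eq!(1e9, buckets[7]);
}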

@@ -15,6 +15,7 @@
//! Scans a region according to the scan request.
use std::sync::Arc;
use std::time::Instant;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::{debug, warn};
@@ -124,6 +125,8 @@ pub(crate) struct ScanRegion {
parallelism: ScanParallism,
/// Whether to ignore inverted index.
ignore_inverted_index: bool,
/// Start time of the scan task.
start_time: Option<Instant>,
}
impl ScanRegion {
@@ -141,6 +144,7 @@ impl ScanRegion {
cache_manager,
parallelism: ScanParallism::default(),
ignore_inverted_index: false,
start_time: None,
}
}
@@ -152,11 +156,17 @@ impl ScanRegion {
}
#[must_use]
pub(crate) fn ignore_inverted_index(mut self, ignore: bool) -> Self {
pub(crate) fn with_ignore_inverted_index(mut self, ignore: bool) -> Self {
self.ignore_inverted_index = ignore;
self
}
#[must_use]
pub(crate) fn with_start_time(mut self, now: Instant) -> Self {
self.start_time = Some(now);
self
}
/// Returns a [Scanner] to scan the region.
pub(crate) fn scanner(self) -> Result<Scanner> {
self.seq_scan().map(Scanner::Seq)
@@ -223,7 +233,8 @@ impl ScanRegion {
.with_files(files)
.with_cache(self.cache_manager)
.with_index_applier(index_applier)
.with_parallelism(self.parallelism);
.with_parallelism(self.parallelism)
.with_start_time(self.start_time);
Ok(seq_scan)
}


@@ -32,7 +32,7 @@ use crate::access_layer::AccessLayerRef;
use crate::cache::{CacheManager, CacheManagerRef};
use crate::error::Result;
use crate::memtable::MemtableRef;
use crate::metrics::READ_STAGE_ELAPSED;
use crate::metrics::{READ_BATCHES_RETURN, READ_ROWS_RETURN, READ_SST_COUNT, READ_STAGE_ELAPSED};
use crate::read::compat::{self, CompatReader};
use crate::read::merge::MergeReaderBuilder;
use crate::read::projection::ProjectionMapper;
@@ -65,6 +65,8 @@ pub struct SeqScan {
parallelism: ScanParallism,
/// Index applier.
index_applier: Option<SstIndexApplierRef>,
/// Start time of the query.
query_start: Option<Instant>,
}
impl SeqScan {
@@ -82,6 +84,7 @@ impl SeqScan {
ignore_file_not_found: false,
parallelism: ScanParallism::default(),
index_applier: None,
query_start: None,
}
}
@@ -141,10 +144,19 @@ impl SeqScan {
self
}
/// Sets start time of the query.
#[must_use]
pub(crate) fn with_start_time(mut self, now: Option<Instant>) -> Self {
self.query_start = now;
self
}
/// Builds a stream for the query.
pub async fn build_stream(&self) -> Result<SendableRecordBatchStream> {
let start = Instant::now();
let mut metrics = Metrics::default();
let build_start = Instant::now();
let query_start = self.query_start.unwrap_or(build_start);
metrics.prepare_scan_cost = query_start.elapsed();
let use_parallel = self.use_parallel_reader();
// Scans all memtables and SSTs. Builds a merge reader to merge results.
let mut reader = if use_parallel {
@@ -152,9 +164,13 @@ impl SeqScan {
} else {
self.build_reader().await?
};
let elapsed = start.elapsed();
metrics.build_reader_cost = elapsed;
metrics.scan_cost = elapsed;
metrics.build_reader_cost = build_start.elapsed();
READ_STAGE_ELAPSED
.with_label_values(&["prepare_scan"])
.observe(metrics.prepare_scan_cost.as_secs_f64());
READ_STAGE_ELAPSED
.with_label_values(&["build_reader"])
.observe(metrics.build_reader_cost.as_secs_f64());
// Creates a stream to poll the batch reader and convert batch into record batch.
let mapper = self.mapper.clone();
@@ -165,15 +181,22 @@ impl SeqScan {
while let Some(batch) =
Self::fetch_record_batch(&mut reader, &mapper, cache, &mut metrics).await?
{
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
yield batch;
}
// Update metrics.
metrics.total_cost = query_start.elapsed();
READ_STAGE_ELAPSED.with_label_values(&["convert_rb"]).observe(metrics.convert_cost.as_secs_f64());
READ_STAGE_ELAPSED.with_label_values(&["scan"]).observe(metrics.scan_cost.as_secs_f64());
READ_STAGE_ELAPSED.with_label_values(&["total"]).observe(metrics.total_cost.as_secs_f64());
READ_ROWS_RETURN.observe(metrics.num_rows as f64);
READ_BATCHES_RETURN.observe(metrics.num_batches as f64);
debug!(
"Seq scan finished, region_id: {:?}, metrics: {:?}, use_parallel: {}, parallelism: {}",
mapper.metadata().region_id, metrics, use_parallel, parallelism,
);
// Update metrics.
READ_STAGE_ELAPSED.with_label_values(&["total"]).observe(metrics.scan_cost.as_secs_f64());
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.mapper.output_schema(),
@@ -249,6 +272,8 @@ impl SeqScan {
}
}
READ_SST_COUNT.observe(self.files.len() as f64);
Ok(sources)
}
@@ -318,12 +343,20 @@ impl SeqScan {
/// Metrics for [SeqScan].
#[derive(Debug, Default)]
struct Metrics {
/// Duration to prepare the scan task.
prepare_scan_cost: Duration,
/// Duration to build the reader.
build_reader_cost: Duration,
/// Duration to scan data.
scan_cost: Duration,
/// Duration to convert batches.
convert_cost: Duration,
/// Total duration from the query start until the stream finishes.
total_cost: Duration,
/// Number of batches returned.
num_batches: usize,
/// Number of rows returned.
num_rows: usize,
}
#[cfg(test)]

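With these changes, the per-query metrics break down as follows: prepare_scan_cost covers the time from the query start (passed down from handle_query) until build_stream begins building readers, build_reader_cost covers reader construction, scan_cost and convert_cost keep accumulating while the stream is polled, and total_cost is measured from the original query start. Each stage is observed under READ_STAGE_ELAPSED, and the returned row and batch counts feed READ_ROWS_RETURN and READ_BATCHES_RETURN.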

@@ -171,6 +171,8 @@ impl RegionOpener {
// Initial memtable id is 0.
let mutable = self.memtable_builder.build(0, &metadata);
debug!("Create region {} with options: {:?}", region_id, options);
let version = VersionBuilder::new(metadata, mutable)
.options(options)
.build();
@@ -249,6 +251,9 @@ impl RegionOpener {
let region_id = self.region_id;
let object_store = self.object_store(&region_options.storage)?.clone();
debug!("Open region {} with options: {:?}", region_id, self.options);
let access_layer = Arc::new(AccessLayer::new(
self.region_dir.clone(),
object_store,


@@ -13,6 +13,8 @@
// limitations under the License.
//! Options for a region.
//!
//! If we add options in this mod, we also need to modify [store_api::mito_engine_options].
use std::collections::HashMap;
use std::time::Duration;
@@ -358,6 +360,7 @@ mod tests {
("compaction.type", "twcs"),
("storage", "S3"),
("index.inverted_index.ignore_column_ids", "1,2,3"),
("index.inverted_index.segment_row_count", "512"),
(
WAL_OPTIONS_KEY,
&serde_json::to_string(&wal_options).unwrap(),
@@ -376,7 +379,7 @@ mod tests {
index_options: IndexOptions {
inverted_index: InvertedIndexOptions {
ignore_column_ids: vec![1, 2, 3],
segment_row_count: 1024,
segment_row_count: 512,
},
},
};


@@ -215,6 +215,61 @@ impl SortField {
Decimal128, Decimal128
)
}
/// Skips deserializing this field and returns its encoded length in bytes.
fn skip_deserialize(
&self,
bytes: &[u8],
deserializer: &mut Deserializer<&[u8]>,
) -> Result<usize> {
let pos = deserializer.position();
if bytes[pos] == 0 {
deserializer.advance(1);
return Ok(1);
}
let to_skip = match &self.data_type {
ConcreteDataType::Boolean(_) => 2,
ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
ConcreteDataType::Int32(_) | ConcreteDataType::UInt32(_) => 5,
ConcreteDataType::Int64(_) | ConcreteDataType::UInt64(_) => 9,
ConcreteDataType::Float32(_) => 5,
ConcreteDataType::Float64(_) => 9,
ConcreteDataType::Binary(_) => {
// The encoder currently encodes binary as a list of bytes, so we can't use
// `skip_bytes` here.
let pos_before = deserializer.position();
let mut current = pos_before + 1;
while bytes[current] == 1 {
current += 2;
}
let to_skip = current - pos_before + 1;
deserializer.advance(to_skip);
return Ok(to_skip);
}
ConcreteDataType::String(_) => {
let pos_before = deserializer.position();
deserializer.advance(1);
deserializer
.skip_bytes()
.context(error::DeserializeFieldSnafu)?;
return Ok(deserializer.position() - pos_before);
}
ConcreteDataType::Date(_) => 5,
ConcreteDataType::DateTime(_) => 9,
ConcreteDataType::Timestamp(_) => 9, // We treat timestamp as Option<i64>
ConcreteDataType::Time(_) => 10, // i64 and 1 byte time unit
ConcreteDataType::Duration(_) => 10,
ConcreteDataType::Interval(_) => 18,
ConcreteDataType::Decimal128(_) => 19,
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_) => 0,
};
deserializer.advance(to_skip);
Ok(to_skip)
}
}
/// A memory-comparable row [Value] encoder/decoder.
@@ -236,6 +291,52 @@ impl McmpRowCodec {
pub fn estimated_size(&self) -> usize {
self.fields.iter().map(|f| f.estimated_size()).sum()
}
/// Decode value at `pos` in `bytes`.
///
/// The i-th element in the offsets buffer is the number of bytes to skip before reading the value at index i.
pub fn decode_value_at(
&self,
bytes: &[u8],
pos: usize,
offsets_buf: &mut Vec<usize>,
) -> Result<Value> {
let mut deserializer = Deserializer::new(bytes);
if pos < offsets_buf.len() {
// We computed the offset before.
let to_skip = offsets_buf[pos];
deserializer.advance(to_skip);
return self.fields[pos].deserialize(&mut deserializer);
}
if offsets_buf.is_empty() {
let mut offset = 0;
// Skip values before `pos`.
for i in 0..pos {
// Offset to skip before reading value i.
offsets_buf.push(offset);
let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
offset += skip;
}
// Offset to skip before reading this value.
offsets_buf.push(offset);
} else {
// Offsets are not enough.
let value_start = offsets_buf.len() - 1;
// Advances to decode value at `value_start`.
let mut offset = offsets_buf[value_start];
deserializer.advance(offset);
for i in value_start..pos {
// Skip value i.
let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
// Offset for the value at i + 1.
offset += skip;
offsets_buf.push(offset);
}
}
self.fields[pos].deserialize(&mut deserializer)
}
}
impl RowCodec for McmpRowCodec {
@@ -274,7 +375,7 @@ impl RowCodec for McmpRowCodec {
#[cfg(test)]
mod tests {
use common_base::bytes::StringBytes;
use common_time::Timestamp;
use common_time::{DateTime, Timestamp};
use datatypes::value::Value;
use super::*;
@@ -292,6 +393,18 @@ mod tests {
let result = encoder.encode(value_ref.iter().cloned()).unwrap();
let decoded = encoder.decode(&result).unwrap();
assert_eq!(decoded, row);
let mut decoded = Vec::new();
let mut offsets = Vec::new();
// Iter two times to test offsets buffer.
for _ in 0..2 {
decoded.clear();
for i in 0..data_types.len() {
let value = encoder.decode_value_at(&result, i, &mut offsets).unwrap();
decoded.push(value);
}
assert_eq!(data_types.len(), offsets.len(), "offsets: {:?}", offsets);
assert_eq!(decoded, row);
}
}
#[test]
@@ -416,5 +529,53 @@ mod tests {
],
vec![Value::Null, Value::Int64(43), Value::Boolean(true)],
);
// All types.
check_encode_and_decode(
&[
ConcreteDataType::boolean_datatype(),
ConcreteDataType::int8_datatype(),
ConcreteDataType::uint8_datatype(),
ConcreteDataType::int16_datatype(),
ConcreteDataType::uint16_datatype(),
ConcreteDataType::int32_datatype(),
ConcreteDataType::uint32_datatype(),
ConcreteDataType::int64_datatype(),
ConcreteDataType::uint64_datatype(),
ConcreteDataType::float32_datatype(),
ConcreteDataType::float64_datatype(),
ConcreteDataType::binary_datatype(),
ConcreteDataType::string_datatype(),
ConcreteDataType::date_datatype(),
ConcreteDataType::datetime_datatype(),
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::time_millisecond_datatype(),
ConcreteDataType::duration_millisecond_datatype(),
ConcreteDataType::interval_month_day_nano_datatype(),
ConcreteDataType::decimal128_default_datatype(),
],
vec![
Value::Boolean(true),
Value::Int8(8),
Value::UInt8(8),
Value::Int16(16),
Value::UInt16(16),
Value::Int32(32),
Value::UInt32(32),
Value::Int64(64),
Value::UInt64(64),
Value::Float32(1.0.into()),
Value::Float64(1.0.into()),
Value::Binary(b"hello"[..].into()),
Value::String("world".into()),
Value::Date(Date::new(10)),
Value::DateTime(DateTime::new(11)),
Value::Timestamp(Timestamp::new_millisecond(12)),
Value::Time(Time::new_millisecond(13)),
Value::Duration(Duration::new_millisecond(14)),
Value::Interval(Interval::from_month_day_nano(1, 1, 15)),
Value::Decimal128(Decimal128::from(16)),
],
);
}
}
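As a usage sketch (not part of the diff), decode_value_at lets a caller pull out a single key part and reuse the offsets buffer across lookups. The constructors below follow the existing tests in this module; the exact SortField/McmpRowCodec signatures and the ValueRef import are assumptions:
#[test]
fn test_decode_value_at_single_field() {
    // Assumes `use datatypes::value::ValueRef;` in this test module.
    let codec = McmpRowCodec::new(vec![
        SortField::new(ConcreteDataType::string_datatype()),
        SortField::new(ConcreteDataType::uint32_datatype()),
    ]);
    let pk = codec
        .encode([ValueRef::String("host-0"), ValueRef::UInt32(7)].into_iter())
        .unwrap();
    let mut offsets = Vec::new();
    // The string field is skipped via skip_deserialize(); only the u32 is decoded.
    assert_eq!(
        Value::UInt32(7),
        codec.decode_value_at(&pk, 1, &mut offsets).unwrap()
    );
    // A second lookup reuses the cached offset instead of skipping the string again.
    assert_eq!(
        Value::UInt32(7),
        codec.decode_value_at(&pk, 1, &mut offsets).unwrap()
    );
}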


@@ -219,25 +219,14 @@ pub(crate) fn extract_data_batch(batch: &DataBatch) -> (u16, Vec<(i64, u64)>) {
/// Builds key values with timestamps (ms) and sequences for test.
pub(crate) fn build_key_values_with_ts_seq_values(
schema: &RegionMetadataRef,
metadata: &RegionMetadataRef,
k0: String,
k1: u32,
timestamps: impl Iterator<Item = i64>,
values: impl Iterator<Item = Option<f64>>,
sequence: SequenceNumber,
) -> KeyValues {
let column_schema = schema
.column_metadatas
.iter()
.map(|c| api::v1::ColumnSchema {
column_name: c.column_schema.name.clone(),
datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
.unwrap()
.datatype() as i32,
semantic_type: c.semantic_type as i32,
..Default::default()
})
.collect();
let column_schema = region_metadata_to_row_schema(metadata);
let rows = timestamps
.zip(values)
@@ -269,7 +258,23 @@ pub(crate) fn build_key_values_with_ts_seq_values(
rows,
}),
};
KeyValues::new(schema.as_ref(), mutation).unwrap()
KeyValues::new(metadata.as_ref(), mutation).unwrap()
}
/// Converts the region metadata to column schemas for a row.
pub fn region_metadata_to_row_schema(metadata: &RegionMetadataRef) -> Vec<api::v1::ColumnSchema> {
metadata
.column_metadatas
.iter()
.map(|c| api::v1::ColumnSchema {
column_name: c.column_schema.name.clone(),
datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
.unwrap()
.datatype() as i32,
semantic_type: c.semantic_type as i32,
..Default::default()
})
.collect()
}
/// Encode keys.

Some files were not shown because too many files have changed in this diff.