Compare commits

29 Commits

Author SHA1 Message Date
liyang
01fdbf3626 chore: upgrade 0.4.2 (#2644) 2023-10-24 12:21:58 +08:00
Lei, HUANG
97897aaf9b fix: predicate shall use real schema to create physical exprs (#2642)
* fix: prune predicate shall use real schema to create physical exprs

* refactor: remove redundant results

* fix: unit tests

* test: add more sqlness cases

* test: add more sqlness cases

* fix: sqlness orderby

* chore: update log

* fix: cache physical expr in memtable iter

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
2023-10-24 03:41:25 +00:00
Wei
1fc42a681f refactor: create_or_open always sets writable (#2641)
feat: set opened region writable
2023-10-23 10:32:51 +00:00
Wei
fbc8f56eaa feat: lookup manifest file size (#2590)
* feat: get manifest file size

* feat: manifest size statistics

* refactor: manifest map key

* chore: comment and unit test

* chore: remove unused function

* chore: change style

* Apply suggestions from code review

Co-authored-by: Yingwen <realevenyag@gmail.com>

* chore: cr comment

* chore: cr comment

* chore: cr comment

* chore: cr comment

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
2023-10-23 08:59:00 +00:00
yuanbohan
44280f7c9d feat(otlp): initial OTLP trace support (#2627)
* feat: otlp tracing framework via http

* feat: otlp trace transformer plugin

* feat: successfully write traces into db

* chore: plugin to parse request

* test: helper functions

* feat: parse_request_to_spans function

* chore: remove implicit call to parse in TraceParser

* chore: fix clippy

* chore: add TODO marker for span fields

* refactor TraceParser trait

* refactor TraceParser trait

* table_name method in OTLP TraceParser trait

* fix: approximate row, column count

* chore: function signature without row

* chore: avoid clone by moving span.kind up

* docs for parse and to_grpc_insert_requests

---------

Co-authored-by: fys <fengys1996@gmail.com>
Co-authored-by: fys <40801205+fengys1996@users.noreply.github.com>
2023-10-23 06:37:43 +00:00
Ning Sun
0fbde48655 feat: hide internal error and unknown error message from end user (#2544)
* feat: use fixed error message for unknown error

* feat: return fixed message for internal error as well

* chore: include status code in error message

* test: update tests for asserts of error message

* feat: change status code of some datafusion error

* fix: make CollectRecordbatch a query error

* test: update sqlness results
2023-10-23 03:07:35 +00:00
Niwaka
9dcfd28f61 feat: impl ObjectStoreManager for custom_storage (#2621)
* feat: impl ObjectStoreManager for custom_storage

* fix: rename object_store_manager to manager

* fix: rename global to default

* chore: add document for ObjectStoreManager

* refactor: simplify default_object_store

* fix: address review
2023-10-23 03:00:29 +00:00
Yingwen
82dbc3e1ae feat(mito): Ports InMemoryRowGroup from parquet crate (#2633)
* feat: ports InMemoryRowGroup from parquet

* chore: pub InMemoryRowGroup

* style: allow some clippy lints
2023-10-23 02:22:19 +00:00
Ruihang Xia
4d478658b5 fix: pass datanode config file in distributed mode sqlness (#2631)
* fix: pass datanode config file in distributed mode sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-10-20 10:57:23 +00:00
localhost
89ebe47cd9 feat: RepeatedTask adds execute-first-wait-later behavior. (#2625)
* feat: RepeatedTask adds execute-first-wait-later behavior.

* feat: add interval generator for the repeated task component

* feat: impl debug for dyn IntervalGenerator trait

* chore: change some words

* chore: instead of a complicated approach, add an initial_delay to control the task interval

* chore: some improvements per PR comments
2023-10-20 09:43:45 +00:00
Ruihang Xia
212ea2c25c feat: implement HistogramFold plan for prometheus histogram type (#2626)
* basic impl of fold plan

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add schema test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fill plan attributes

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix styles

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* unify variable names

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-10-20 07:42:10 +00:00
Ruihang Xia
1658d088ab ci: add size labeler (#2628)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-10-20 06:39:13 +00:00
Baasit
346b57cf10 feat: row protocol support for opentsdb (#2623)
* feat: opentsdb row protocol

* fix: added comments for number of rows and failure if output is not of affected rows

* fix: added extra 1 to number of columns

* fix: avoided cloning datapoints, took ownership instead

* fix: avoided cloning datapoints, took ownership instead

* fix: changed vector slice to vector

* fix: remove clone

* fix: combined datapoints and requests with zip instead of enumerating

---------

Co-authored-by: Ubuntu <ubuntu@ip-172-31-43-183.us-east-2.compute.internal>
2023-10-20 06:25:59 +00:00
Weny Xu
e1dcf83326 fix: correct the range behavior in MemoryKvBackend & RaftEngineBackend (#2615)
* fix: correct the range behavior in MemoryKvBackend & RaftEngineBackend

* refactor: migrate tests from MemoryKvBackend

* chore: apply suggestions from CR

* fix: fix license header

* chore: apply suggestions from CR

* chore: apply suggestions from CR

* fix: fix range bugs
2023-10-20 02:30:47 +00:00
Ning Sun
b5d9d635eb ci: add slack notification for nightly ci failure (#2617) 2023-10-19 15:47:15 +00:00
zyy17
88dd78a69c ci: remove the old version python (#2624)
ci: remove old version python
2023-10-19 15:46:15 +00:00
zyy17
6439b929b3 ci: the 'publish-github-release' and 'release-cn-artifacts' jobs have to wait until all the artifacts are built (#2622) 2023-10-19 21:05:44 +08:00
Wei
ba15c14103 feat: get internal value size of ValueRef (#2613)
* feat: impl byte_size

* chore: clippy

* chore: cr comment
2023-10-19 11:59:37 +08:00
Weny Xu
d57b144b2f chore: change test_remove_outdated_meta_task sleep time to 40ms (#2620)
chore: change test_remove_outdated_meta_task sleep time to 300ms
2023-10-18 11:33:35 +00:00
WU Jingdi
46e106bcc3 feat: allow nest range expr in Range Query (#2557)
* feat: enable range expr nesting

* fix: change range expr rewrite format

* chore: organize range query tests

* chore: change range expr name (e.g. MAX(v) RANGE 5s FILL 6)

* chore: add range query test

* chore: fix code advice

* chore: fix ca
2023-10-18 07:03:26 +00:00
localhost
a7507a2b12 chore: change telemetry report url to resolve connectivity issues (#2608)
chore: change otel report url to resolve connectivity issues
2023-10-18 06:58:54 +00:00
Wei
5b8e5066a0 refactor: make ReadableSize more readable. (#2614)
* refactor: ReadableSize is readable.

* docs: Update src/common/base/src/readable_size.rs

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
2023-10-18 06:32:50 +00:00
Weny Xu
dcd481e6a4 feat: stop the procedure manager if a new leader is elected (#2576)
* feat: stop the procedure manager if a new leader is elected

* chore: apply suggestions from CR

* chore: apply suggestions

* chore: apply suggestions from CR

* feat: add should_report to GreptimeDBTelemetry

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor: refactor subscribing leader change loop

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2023-10-18 06:12:28 +00:00
zyy17
3217b56cc1 ci: release new version '0.4.0' -> '0.4.1' (#2611) 2023-10-17 07:33:41 +00:00
shuiyisong
eccad647d0 chore: add export data to migrate tool (#2610)
* chore: add export data to migrate tool

* chore: export copy from sql too
2023-10-17 06:33:58 +00:00
Yun Chen
829db8c5c1 fix!: align frontend cmd name to rpc_* (#2609)
fix: align frontend cmd name to rpc_*
2023-10-17 06:18:18 +00:00
Ruihang Xia
9056c3a6aa feat: implement greptime cli export (#2535)
* feat: implement greptime cli export

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* read information schema

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* parse database name from cli params

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-10-17 01:56:52 +00:00
ZhangJian He
d9e7b898a3 feat: add walconfig dir back (#2606)
Signed-off-by: ZhangJian He <shoothzj@gmail.com>
2023-10-16 11:26:06 +00:00
zyy17
59d4081f7a ci: correct image name of dev build (#2603) 2023-10-16 03:54:44 +00:00
117 changed files with 6438 additions and 2007 deletions

View File

@@ -248,7 +248,7 @@ jobs:
with:
src-image-registry: docker.io
src-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
src-image-name: greptimedb
src-image-name: ${{ env.IMAGE_NAME }}
dst-image-registry-username: ${{ secrets.ALICLOUD_USERNAME }}
dst-image-registry-password: ${{ secrets.ALICLOUD_PASSWORD }}
dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}

View File

@@ -34,6 +34,14 @@ jobs:
uses: Swatinem/rust-cache@v2
- name: Run sqlness
run: cargo sqlness
- name: Notify slack if failed
if: failure()
uses: slackapi/slack-github-action@v1.23.0
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }}
with:
payload: |
{"text": "Nightly CI failed for sqlness tests"}
- name: Upload sqlness logs
if: always()
uses: actions/upload-artifact@v3
@@ -80,3 +88,11 @@ jobs:
GT_S3_ACCESS_KEY: ${{ secrets.S3_ACCESS_KEY }}
GT_S3_REGION: ${{ secrets.S3_REGION }}
UNITTEST_LOG_DIR: "__unittest_logs"
- name: Notify slack if failed
if: failure()
uses: slackapi/slack-github-action@v1.23.0
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }}
with:
payload: |
{"text": "Nightly CI failed for cargo test"}

View File

@@ -302,8 +302,12 @@ jobs:
release-cn-artifacts:
name: Release artifacts to CN region
if: ${{ inputs.release_images || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
needs: [ # The job has to wait until all the artifacts are built.
allocate-runners,
build-linux-amd64-artifacts,
build-linux-arm64-artifacts,
build-macos-artifacts,
build-windows-artifacts,
release-images-to-dockerhub,
]
runs-on: ubuntu-20.04
@@ -338,11 +342,12 @@ jobs:
publish-github-release:
name: Create GitHub release and upload artifacts
if: ${{ inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
needs: [ # The job has to wait until all the artifacts are built.
allocate-runners,
build-linux-amd64-artifacts,
build-linux-arm64-artifacts,
build-macos-artifacts,
build-windows-artifacts,
release-images-to-dockerhub,
]
runs-on: ubuntu-20.04

.github/workflows/size-label.yml (new file, 26 lines)
View File

@@ -0,0 +1,26 @@
name: size-labeler
on: [pull_request]
jobs:
labeler:
runs-on: ubuntu-latest
name: Label the PR size
steps:
- uses: codelytv/pr-size-labeler@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
s_label: 'Size: S'
s_max_size: '100'
m_label: 'Size: M'
m_max_size: '500'
l_label: 'Size: L'
l_max_size: '1000'
xl_label: 'Size: XL'
fail_if_xl: 'false'
message_if_xl: >
This PR exceeds the recommended size of 1000 lines.
Please make sure you are NOT addressing multiple issues with one PR.
Note this PR might be rejected due to its size.
github_api_url: 'api.github.com'
files_to_ignore: 'Cargo.lock'

Cargo.lock (generated, 126 changed lines)
View File

@@ -204,7 +204,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72"
[[package]]
name = "api"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"common-base",
"common-error",
@@ -666,7 +666,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-trait",
@@ -839,7 +839,7 @@ dependencies = [
[[package]]
name = "benchmarks"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"arrow",
"chrono",
@@ -1222,7 +1222,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arc-swap",
@@ -1506,7 +1506,7 @@ checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961"
[[package]]
name = "client"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arrow-flight",
@@ -1536,7 +1536,7 @@ dependencies = [
"rand",
"session",
"snafu",
"substrait 0.4.0",
"substrait 0.4.2",
"substrait 0.7.5",
"tokio",
"tokio-stream",
@@ -1573,7 +1573,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"anymap",
"async-trait",
@@ -1621,7 +1621,7 @@ dependencies = [
"servers",
"session",
"snafu",
"substrait 0.4.0",
"substrait 0.4.2",
"table",
"temp-env",
"tikv-jemallocator",
@@ -1654,7 +1654,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"anymap",
"bitvec",
@@ -1669,7 +1669,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"chrono",
"common-error",
@@ -1682,7 +1682,7 @@ dependencies = [
[[package]]
name = "common-config"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"common-base",
"humantime-serde",
@@ -1691,7 +1691,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"arrow",
"arrow-schema",
@@ -1720,7 +1720,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"snafu",
"strum 0.25.0",
@@ -1728,7 +1728,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"arc-swap",
"chrono-tz 0.6.3",
@@ -1751,7 +1751,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-trait",
"common-error",
@@ -1770,7 +1770,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arrow-flight",
@@ -1800,7 +1800,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-trait",
@@ -1819,7 +1819,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"arc-swap",
"backtrace",
@@ -1836,7 +1836,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"common-error",
"common-macro",
@@ -1849,7 +1849,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arrow-flight",
@@ -1887,7 +1887,7 @@ dependencies = [
[[package]]
name = "common-procedure"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-stream",
"async-trait",
@@ -1911,7 +1911,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-trait",
"common-procedure",
@@ -1919,7 +1919,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-trait",
@@ -1942,7 +1942,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"common-error",
"common-macro",
@@ -1959,7 +1959,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-trait",
"common-error",
@@ -1976,7 +1976,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"backtrace",
"common-error",
@@ -2003,7 +2003,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"once_cell",
"rand",
@@ -2012,7 +2012,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"arrow",
"chrono",
@@ -2027,7 +2027,7 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"build-data",
]
@@ -2665,7 +2665,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arrow-flight",
@@ -2724,7 +2724,7 @@ dependencies = [
"sql",
"storage",
"store-api",
"substrait 0.4.0",
"substrait 0.4.2",
"table",
"tokio",
"tokio-stream",
@@ -2738,7 +2738,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"arrow",
"arrow-array",
@@ -3201,7 +3201,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-trait",
@@ -3311,7 +3311,7 @@ dependencies = [
[[package]]
name = "frontend"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arc-swap",
@@ -3375,7 +3375,7 @@ dependencies = [
"storage",
"store-api",
"strfmt",
"substrait 0.4.0",
"substrait 0.4.2",
"table",
"tokio",
"toml 0.7.6",
@@ -5006,7 +5006,7 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "log-store"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-stream",
"async-trait",
@@ -5276,7 +5276,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-trait",
@@ -5306,7 +5306,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"anymap",
"api",
@@ -5498,7 +5498,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"anymap",
"api",
@@ -5960,11 +5960,13 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"common-error",
"common-macro",
"common-runtime",
"common-telemetry",
"common-test-util",
@@ -5973,6 +5975,7 @@ dependencies = [
"metrics",
"moka",
"opendal",
"snafu",
"tokio",
"uuid",
]
@@ -6184,7 +6187,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-compat",
@@ -6229,7 +6232,7 @@ dependencies = [
"sqlparser 0.34.0",
"storage",
"store-api",
"substrait 0.4.0",
"substrait 0.4.2",
"table",
"tokio",
"tonic 0.9.2",
@@ -6449,7 +6452,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-trait",
@@ -6775,7 +6778,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"auth",
"common-base",
@@ -7025,7 +7028,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-recursion",
"async-trait",
@@ -7034,6 +7037,7 @@ dependencies = [
"common-catalog",
"common-error",
"common-macro",
"common-recordbatch",
"common-telemetry",
"datafusion",
"datatypes",
@@ -7287,7 +7291,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"ahash 0.8.3",
"api",
@@ -7344,7 +7348,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.4.0",
"substrait 0.4.2",
"table",
"tokio",
"tokio-stream",
@@ -8543,7 +8547,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "script"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arc-swap",
@@ -8823,7 +8827,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"aide",
"api",
@@ -8917,7 +8921,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arc-swap",
@@ -9195,7 +9199,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"common-base",
@@ -9246,7 +9250,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-trait",
"clap 4.4.1",
@@ -9266,13 +9270,13 @@ dependencies = [
[[package]]
name = "sqlparser"
version = "0.34.0"
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4#296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4"
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6cf9d23d5b8fbecd65efc1d9afb7e80ad7a424da#6cf9d23d5b8fbecd65efc1d9afb7e80ad7a424da"
dependencies = [
"lazy_static",
"log",
"regex",
"sqlparser 0.35.0",
"sqlparser_derive 0.1.1 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4)",
"sqlparser_derive 0.1.1 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6cf9d23d5b8fbecd65efc1d9afb7e80ad7a424da)",
]
[[package]]
@@ -9299,7 +9303,7 @@ dependencies = [
[[package]]
name = "sqlparser_derive"
version = "0.1.1"
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4#296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4"
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6cf9d23d5b8fbecd65efc1d9afb7e80ad7a424da#6cf9d23d5b8fbecd65efc1d9afb7e80ad7a424da"
dependencies = [
"proc-macro2",
"quote",
@@ -9452,7 +9456,7 @@ dependencies = [
[[package]]
name = "storage"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"arc-swap",
@@ -9506,7 +9510,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"aquamarine",
@@ -9644,7 +9648,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"async-recursion",
"async-trait",
@@ -9802,7 +9806,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"anymap",
"async-trait",
@@ -9908,7 +9912,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.4.0"
version = "0.4.2"
dependencies = [
"api",
"async-trait",
@@ -9961,7 +9965,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.4.0",
"substrait 0.4.2",
"table",
"tempfile",
"tokio",

View File

@@ -55,7 +55,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.4.0"
version = "0.4.2"
edition = "2021"
license = "Apache-2.0"
@@ -87,7 +87,7 @@ meter-core = { git = "https://github.com/GreptimeTeam/greptime-meter.git", rev =
metrics = "0.20"
moka = "0.12"
once_cell = "1.18"
opentelemetry-proto = { version = "0.2", features = ["gen-tonic", "metrics"] }
opentelemetry-proto = { version = "0.2", features = ["gen-tonic", "metrics", "traces"] }
parquet = "43.0"
paste = "1.0"
prost = "0.11"
@@ -103,7 +103,7 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
smallvec = "1"
snafu = { version = "0.7", features = ["backtraces"] }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4", features = [
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "6cf9d23d5b8fbecd65efc1d9afb7e80ad7a424da", features = [
"visitor",
] }
strum = { version = "0.25", features = ["derive"] }

View File

@@ -82,6 +82,8 @@ enable = true
# WAL options.
[wal]
# WAL data directory
# dir = "/tmp/greptimedb/wal"
# WAL file size in bytes.
file_size = "256MB"
# WAL purge threshold.

View File

@@ -19,8 +19,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
pkg-config \
python3.10 \
python3.10-dev \
python3-pip
python3.10-dev
# Remove Python 3.8 and install pip.
RUN apt-get -y purge python3.8 && \
apt-get -y autoremove && \
ln -s /usr/bin/python3.10 /usr/bin/python3 && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
RUN git config --global --add safe.directory /greptimedb

View File

@@ -167,11 +167,14 @@ impl Database {
}
}
pub async fn sql(&self, sql: &str) -> Result<Output> {
pub async fn sql<S>(&self, sql: S) -> Result<Output>
where
S: AsRef<str>,
{
let _timer = timer!(metrics::METRIC_GRPC_SQL);
self.do_get(
Request::Query(QueryRequest {
query: Some(Query::Sql(sql.to_string())),
query: Some(Query::Sql(sql.as_ref().to_string())),
}),
0,
)
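For context, a minimal caller-side sketch (editorial, not part of this diff) of why the relaxed `S: AsRef<str>` bound helps: both a string literal and an owned `String` can now be passed directly, which is exactly how the new export tool below calls it.

// Hypothetical caller; assumes an async context and a connected `Database` handle `db`.
// `schema` and `dir` stand for assumed local `String`s.
let _ = db.sql("show databases").await?; // &str works as before
let copy_sql = format!("copy database {} to '{}' with (format='parquet');", schema, dir);
let _ = db.sql(copy_sql).await?; // an owned String now works without `.as_str()`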

View File

@@ -14,6 +14,7 @@
mod bench;
mod cmd;
mod export;
mod helper;
mod repl;
// TODO(weny): Removes it
@@ -27,6 +28,7 @@ use common_telemetry::logging::LoggingOptions;
pub use repl::Repl;
use upgrade::UpgradeCommand;
use self::export::ExportCommand;
use crate::error::Result;
use crate::options::{Options, TopLevelOptions};
@@ -81,6 +83,7 @@ enum SubCommand {
// Attach(AttachCommand),
Upgrade(UpgradeCommand),
Bench(BenchTableMetadataCommand),
Export(ExportCommand),
}
impl SubCommand {
@@ -89,6 +92,7 @@ impl SubCommand {
// SubCommand::Attach(cmd) => cmd.build().await,
SubCommand::Upgrade(cmd) => cmd.build().await,
SubCommand::Bench(cmd) => cmd.build().await,
SubCommand::Export(cmd) => cmd.build().await,
}
}
}

src/cmd/src/cli/export.rs (new file, 395 lines)
View File

@@ -0,0 +1,395 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::Path;
use std::sync::Arc;
use async_trait::async_trait;
use clap::{Parser, ValueEnum};
use client::{Client, Database, DEFAULT_SCHEMA_NAME};
use common_query::Output;
use common_recordbatch::util::collect;
use common_telemetry::{debug, error, info, warn};
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{StringVector, Vector};
use snafu::{OptionExt, ResultExt};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use tokio::sync::Semaphore;
use crate::cli::{Instance, Tool};
use crate::error::{
CollectRecordBatchesSnafu, ConnectServerSnafu, EmptyResultSnafu, Error, FileIoSnafu,
InvalidDatabaseNameSnafu, NotDataFromOutputSnafu, RequestDatabaseSnafu, Result,
};
type TableReference = (String, String, String);
#[derive(Debug, Default, Clone, ValueEnum)]
enum ExportTarget {
/// Corresponding to `SHOW CREATE TABLE`
#[default]
CreateTable,
/// Corresponding to `EXPORT TABLE`
TableData,
}
#[derive(Debug, Default, Parser)]
pub struct ExportCommand {
/// Server address to connect
#[clap(long)]
addr: String,
/// Directory to put the exported data. E.g.: /tmp/greptimedb-export
#[clap(long)]
output_dir: String,
/// The name of the catalog to export. Defaults to "greptime-*".
#[clap(long, default_value = "")]
database: String,
/// Parallelism of the export.
#[clap(long, short = 'j', default_value = "1")]
export_jobs: usize,
/// Max retry times for each job.
#[clap(long, default_value = "3")]
max_retry: usize,
/// Things to export
#[clap(long, short = 't', value_enum)]
target: ExportTarget,
}
impl ExportCommand {
pub async fn build(&self) -> Result<Instance> {
let client = Client::with_urls([self.addr.clone()]);
client
.health_check()
.await
.with_context(|_| ConnectServerSnafu {
addr: self.addr.clone(),
})?;
let (catalog, schema) = split_database(&self.database)?;
let database_client = Database::new(
catalog.clone(),
schema.clone().unwrap_or(DEFAULT_SCHEMA_NAME.to_string()),
client,
);
Ok(Instance::Tool(Box::new(Export {
client: database_client,
catalog,
schema,
output_dir: self.output_dir.clone(),
parallelism: self.export_jobs,
target: self.target.clone(),
})))
}
}
pub struct Export {
client: Database,
catalog: String,
schema: Option<String>,
output_dir: String,
parallelism: usize,
target: ExportTarget,
}
impl Export {
/// Iterate over all db names.
///
/// Newbie: `db_name` is catalog + schema.
async fn iter_db_names(&self) -> Result<Vec<(String, String)>> {
if let Some(schema) = &self.schema {
Ok(vec![(self.catalog.clone(), schema.clone())])
} else {
let mut client = self.client.clone();
client.set_catalog(self.catalog.clone());
let result =
client
.sql("show databases")
.await
.with_context(|_| RequestDatabaseSnafu {
sql: "show databases".to_string(),
})?;
let Output::Stream(stream) = result else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)
.await
.context(CollectRecordBatchesSnafu)?
.pop()
.context(EmptyResultSnafu)?;
let schemas = record_batch
.column(0)
.as_any()
.downcast_ref::<StringVector>()
.unwrap();
let mut result = Vec::with_capacity(schemas.len());
for i in 0..schemas.len() {
let schema = schemas.get_data(i).unwrap().to_owned();
result.push((self.catalog.clone(), schema));
}
Ok(result)
}
}
/// Return a list of [`TableReference`] to be exported.
/// Includes all tables under the given `catalog` and `schema`
async fn get_table_list(&self, catalog: &str, schema: &str) -> Result<Vec<TableReference>> {
// TODO: SQL injection hurts
let sql = format!(
"select table_catalog, table_schema, table_name from \
information_schema.tables where table_type = \'BASE TABLE\'\
and table_catalog = \'{catalog}\' and table_schema = \'{schema}\'",
);
let mut client = self.client.clone();
client.set_catalog(catalog);
client.set_schema(schema);
let result = client
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream) = result else {
NotDataFromOutputSnafu.fail()?
};
let Some(record_batch) = collect(stream)
.await
.context(CollectRecordBatchesSnafu)?
.pop()
else {
return Ok(vec![]);
};
debug!("Fetched table list: {}", record_batch.pretty_print());
if record_batch.num_rows() == 0 {
return Ok(vec![]);
}
let mut result = Vec::with_capacity(record_batch.num_rows());
let catalog_column = record_batch
.column(0)
.as_any()
.downcast_ref::<StringVector>()
.unwrap();
let schema_column = record_batch
.column(1)
.as_any()
.downcast_ref::<StringVector>()
.unwrap();
let table_column = record_batch
.column(2)
.as_any()
.downcast_ref::<StringVector>()
.unwrap();
for i in 0..record_batch.num_rows() {
let catalog = catalog_column.get_data(i).unwrap().to_owned();
let schema = schema_column.get_data(i).unwrap().to_owned();
let table = table_column.get_data(i).unwrap().to_owned();
result.push((catalog, schema, table));
}
Ok(result)
}
async fn show_create_table(&self, catalog: &str, schema: &str, table: &str) -> Result<String> {
let sql = format!("show create table {}.{}.{}", catalog, schema, table);
let mut client = self.client.clone();
client.set_catalog(catalog);
client.set_schema(schema);
let result = client
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream) = result else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)
.await
.context(CollectRecordBatchesSnafu)?
.pop()
.context(EmptyResultSnafu)?;
let create_table = record_batch
.column(1)
.as_any()
.downcast_ref::<StringVector>()
.unwrap()
.get_data(0)
.unwrap();
Ok(format!("{create_table};\n"))
}
async fn export_create_table(&self) -> Result<()> {
let semaphore = Arc::new(Semaphore::new(self.parallelism));
let db_names = self.iter_db_names().await?;
let db_count = db_names.len();
let mut tasks = Vec::with_capacity(db_names.len());
for (catalog, schema) in db_names {
let semaphore_moved = semaphore.clone();
tasks.push(async move {
let _permit = semaphore_moved.acquire().await.unwrap();
let table_list = self.get_table_list(&catalog, &schema).await?;
let table_count = table_list.len();
tokio::fs::create_dir_all(&self.output_dir)
.await
.context(FileIoSnafu)?;
let output_file =
Path::new(&self.output_dir).join(format!("{catalog}-{schema}.sql"));
let mut file = File::create(output_file).await.context(FileIoSnafu)?;
for (c, s, t) in table_list {
match self.show_create_table(&c, &s, &t).await {
Err(e) => {
error!(e; "Failed to export table {}.{}.{}", c, s, t)
}
Ok(create_table) => {
file.write_all(create_table.as_bytes())
.await
.context(FileIoSnafu)?;
}
}
}
info!("finished exporting {catalog}.{schema} with {table_count} tables",);
Ok::<(), Error>(())
});
}
let success = futures::future::join_all(tasks)
.await
.into_iter()
.filter(|r| match r {
Ok(_) => true,
Err(e) => {
error!(e; "export job failed");
false
}
})
.count();
info!("success {success}/{db_count} jobs");
Ok(())
}
async fn export_table_data(&self) -> Result<()> {
let semaphore = Arc::new(Semaphore::new(self.parallelism));
let db_names = self.iter_db_names().await?;
let db_count = db_names.len();
let mut tasks = Vec::with_capacity(db_names.len());
for (catalog, schema) in db_names {
let semaphore_moved = semaphore.clone();
tasks.push(async move {
let _permit = semaphore_moved.acquire().await.unwrap();
tokio::fs::create_dir_all(&self.output_dir)
.await
.context(FileIoSnafu)?;
let output_dir = Path::new(&self.output_dir).join(format!("{catalog}-{schema}/"));
let mut client = self.client.clone();
client.set_catalog(catalog.clone());
client.set_schema(schema.clone());
// copy database to
let sql = format!(
"copy database {} to '{}' with (format='parquet');",
schema,
output_dir.to_str().unwrap()
);
client
.sql(sql.clone())
.await
.context(RequestDatabaseSnafu { sql })?;
info!("finished exporting {catalog}.{schema} data");
// export copy from sql
let dir_filenames = match output_dir.read_dir() {
Ok(dir) => dir,
Err(_) => {
warn!("empty database {catalog}.{schema}");
return Ok(());
}
};
let copy_from_file =
Path::new(&self.output_dir).join(format!("{catalog}-{schema}_copy_from.sql"));
let mut file = File::create(copy_from_file).await.context(FileIoSnafu)?;
let copy_from_sql = dir_filenames
.into_iter()
.map(|file| {
let file = file.unwrap();
let filename = file.file_name().into_string().unwrap();
format!(
"copy {} from '{}' with (format='parquet');\n",
filename.replace(".parquet", ""),
file.path().to_str().unwrap()
)
})
.collect::<Vec<_>>()
.join("");
file.write_all(copy_from_sql.as_bytes())
.await
.context(FileIoSnafu)?;
info!("finished exporting {catalog}.{schema} copy_from.sql");
Ok::<(), Error>(())
});
}
let success = futures::future::join_all(tasks)
.await
.into_iter()
.filter(|r| match r {
Ok(_) => true,
Err(e) => {
error!(e; "export job failed");
false
}
})
.count();
info!("success {success}/{db_count} jobs");
Ok(())
}
}
#[async_trait]
impl Tool for Export {
async fn do_work(&self) -> Result<()> {
match self.target {
ExportTarget::CreateTable => self.export_create_table().await,
ExportTarget::TableData => self.export_table_data().await,
}
}
}
/// Split at `-`.
fn split_database(database: &str) -> Result<(String, Option<String>)> {
let (catalog, schema) = database
.split_once('-')
.with_context(|| InvalidDatabaseNameSnafu {
database: database.to_string(),
})?;
if schema == "*" {
Ok((catalog.to_string(), None))
} else {
Ok((catalog.to_string(), Some(schema.to_string())))
}
}
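A quick behavioral sketch of `split_database` (hypothetical assertions, not part of the new file), following the `split_once('-')` logic above:

// "catalog-schema" yields both parts, "catalog-*" means all schemas,
// and a name without '-' is rejected.
assert_eq!(
    split_database("greptime-public").unwrap(),
    ("greptime".to_string(), Some("public".to_string()))
);
assert_eq!(split_database("greptime-*").unwrap(), ("greptime".to_string(), None));
assert!(split_database("greptime").is_err()); // InvalidDatabaseName: no '-' separator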

View File

@@ -96,6 +96,8 @@ struct StartCommand {
#[clap(long)]
data_home: Option<String>,
#[clap(long)]
wal_dir: Option<String>,
#[clap(long)]
http_addr: Option<String>,
#[clap(long)]
http_timeout: Option<u64>,
@@ -149,6 +151,10 @@ impl StartCommand {
opts.storage.data_home = data_home.clone();
}
if let Some(wal_dir) = &self.wal_dir {
opts.wal.dir = Some(wal_dir.clone());
}
if let Some(http_addr) = &self.http_addr {
opts.http.addr = http_addr.clone();
}
@@ -255,6 +261,7 @@ mod tests {
assert_eq!("127.0.0.1:3001".to_string(), options.rpc_addr);
assert_eq!(Some(42), options.node_id);
assert_eq!("/other/wal", options.wal.dir.unwrap());
assert_eq!(Duration::from_secs(600), options.wal.purge_interval);
assert_eq!(1024 * 1024 * 1024, options.wal.file_size.0);
@@ -439,6 +446,7 @@ mod tests {
|| {
let command = StartCommand {
config_file: Some(file.path().to_str().unwrap().to_string()),
wal_dir: Some("/other/wal/dir".to_string()),
env_prefix: env_prefix.to_string(),
..Default::default()
};
@@ -466,6 +474,9 @@ mod tests {
// Should be read from config file, config file > env > default values.
assert_eq!(opts.storage.compaction.max_purge_tasks, 32);
// Should be read from cli, cli > config file > env > default values.
assert_eq!(opts.wal.dir.unwrap(), "/other/wal/dir");
// Should be default value.
assert_eq!(
opts.storage.manifest.checkpoint_margin,

View File

@@ -37,6 +37,18 @@ pub enum Error {
source: common_meta::error::Error,
},
#[snafu(display("Failed to start procedure manager"))]
StartProcedureManager {
location: Location,
source: common_procedure::error::Error,
},
#[snafu(display("Failed to stop procedure manager"))]
StopProcedureManager {
location: Location,
source: common_procedure::error::Error,
},
#[snafu(display("Failed to start datanode"))]
StartDatanode {
location: Location,
@@ -174,12 +186,39 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to connect server at {addr}"))]
ConnectServer {
addr: String,
source: client::error::Error,
location: Location,
},
#[snafu(display("Failed to serde json"))]
SerdeJson {
#[snafu(source)]
error: serde_json::error::Error,
location: Location,
},
#[snafu(display("Expect data from output, but got another thing"))]
NotDataFromOutput { location: Location },
#[snafu(display("Empty result from output"))]
EmptyResult { location: Location },
#[snafu(display("Failed to manipulate file"))]
FileIo {
location: Location,
#[snafu(source)]
error: std::io::Error,
},
#[snafu(display("Invalid database name: {}", database))]
InvalidDatabaseName {
location: Location,
database: String,
},
#[snafu(display("Failed to create directory {}", dir))]
CreateDir {
dir: String,
@@ -204,13 +243,18 @@ impl ErrorExt for Error {
Error::IterStream { source, .. } | Error::InitMetadata { source, .. } => {
source.status_code()
}
Error::ConnectServer { source, .. } => source.status_code(),
Error::MissingConfig { .. }
| Error::LoadLayeredConfig { .. }
| Error::IllegalConfig { .. }
| Error::InvalidReplCommand { .. }
| Error::ConnectEtcd { .. }
| Error::NotDataFromOutput { .. }
| Error::CreateDir { .. }
| Error::ConnectEtcd { .. } => StatusCode::InvalidArguments,
| Error::EmptyResult { .. }
| Error::InvalidDatabaseName { .. } => StatusCode::InvalidArguments,
Error::StartProcedureManager { source, .. }
| Error::StopProcedureManager { source, .. } => source.status_code(),
Error::ReplCreation { .. } | Error::Readline { .. } => StatusCode::Internal,
Error::RequestDatabase { source, .. } => source.status_code(),
Error::CollectRecordBatches { source, .. }
@@ -222,7 +266,7 @@ impl ErrorExt for Error {
Error::SubstraitEncodeLogicalPlan { source, .. } => source.status_code(),
Error::StartCatalogManager { source, .. } => source.status_code(),
Error::SerdeJson { .. } => StatusCode::Unexpected,
Error::SerdeJson { .. } | Error::FileIo { .. } => StatusCode::Unexpected,
}
}

View File

@@ -89,7 +89,7 @@ pub struct StartCommand {
#[clap(long)]
http_timeout: Option<u64>,
#[clap(long)]
grpc_addr: Option<String>,
rpc_addr: Option<String>,
#[clap(long)]
mysql_addr: Option<String>,
#[clap(long)]
@@ -150,7 +150,7 @@ impl StartCommand {
opts.http.disable_dashboard = disable_dashboard;
}
if let Some(addr) = &self.grpc_addr {
if let Some(addr) = &self.rpc_addr {
opts.grpc.addr = addr.clone()
}

View File

@@ -263,6 +263,9 @@ mod tests {
]
);
// Should be the values from config file, not environment variables.
assert_eq!(opts.wal.dir.unwrap(), "/tmp/greptimedb/wal");
// Should be default values.
assert_eq!(opts.node_id, None);
},

View File

@@ -43,7 +43,8 @@ use snafu::ResultExt;
use crate::error::{
CreateDirSnafu, IllegalConfigSnafu, InitMetadataSnafu, Result, ShutdownDatanodeSnafu,
ShutdownFrontendSnafu, StartDatanodeSnafu, StartFrontendSnafu,
ShutdownFrontendSnafu, StartDatanodeSnafu, StartFrontendSnafu, StartProcedureManagerSnafu,
StopProcedureManagerSnafu,
};
use crate::options::{MixOptions, Options, TopLevelOptions};
@@ -163,6 +164,7 @@ impl StandaloneOptions {
pub struct Instance {
datanode: Datanode,
frontend: FeInstance,
procedure_manager: ProcedureManagerRef,
}
impl Instance {
@@ -171,6 +173,11 @@ impl Instance {
self.datanode.start().await.context(StartDatanodeSnafu)?;
info!("Datanode instance started");
self.procedure_manager
.start()
.await
.context(StartProcedureManagerSnafu)?;
self.frontend.start().await.context(StartFrontendSnafu)?;
Ok(())
}
@@ -181,6 +188,11 @@ impl Instance {
.await
.context(ShutdownFrontendSnafu)?;
self.procedure_manager
.stop()
.await
.context(StopProcedureManagerSnafu)?;
self.datanode
.shutdown()
.await
@@ -354,7 +366,7 @@ impl StartCommand {
let mut frontend = build_frontend(
fe_plugins,
kv_store,
procedure_manager,
procedure_manager.clone(),
catalog_manager,
region_server,
)
@@ -365,7 +377,11 @@ impl StartCommand {
.await
.context(StartFrontendSnafu)?;
Ok(Instance { datanode, frontend })
Ok(Instance {
datanode,
frontend,
procedure_manager,
})
}
}
@@ -493,6 +509,8 @@ mod tests {
assert_eq!(None, fe_opts.mysql.reject_no_database);
assert!(fe_opts.influxdb.enable);
assert_eq!("/tmp/greptimedb/test/wal", dn_opts.wal.dir.unwrap());
match &dn_opts.storage.store {
datanode::config::ObjectStoreConfig::S3(s3_config) => {
assert_eq!(

View File

@@ -14,10 +14,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is copied from https://github.com/tikv/raft-engine/blob/8dd2a39f359ff16f5295f35343f626e0c10132fa/src/util.rs without any modification.
// This file is copied from https://github.com/tikv/raft-engine/blob/8dd2a39f359ff16f5295f35343f626e0c10132fa/src/util.rs
use std::fmt;
use std::fmt::{Display, Write};
use std::fmt::{self, Debug, Display, Write};
use std::ops::{Div, Mul};
use std::str::FromStr;
@@ -34,7 +33,7 @@ pub const GIB: u64 = MIB * BINARY_DATA_MAGNITUDE;
pub const TIB: u64 = GIB * BINARY_DATA_MAGNITUDE;
pub const PIB: u64 = TIB * BINARY_DATA_MAGNITUDE;
#[derive(Clone, Debug, Copy, PartialEq, Eq, PartialOrd)]
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
pub struct ReadableSize(pub u64);
impl ReadableSize {
@@ -155,6 +154,12 @@ impl FromStr for ReadableSize {
}
}
impl Debug for ReadableSize {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self)
}
}
impl Display for ReadableSize {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.0 >= PIB {
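The effect of the manual `Debug` impl above is that debug formatting now reuses the human-readable `Display` output instead of the derived tuple form. A hypothetical check, using `ReadableSize::mb` as seen in `WalConfig::default`:

let size = ReadableSize::mb(256);
// Debug simply delegates to Display, so the two renderings match
// (no more opaque `ReadableSize(268435456)` in logs and config dumps).
assert_eq!(format!("{:?}", size), format!("{}", size));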

View File

@@ -20,6 +20,8 @@ use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct WalConfig {
// wal directory
pub dir: Option<String>,
// wal file size in bytes
pub file_size: ReadableSize,
// wal purge threshold in bytes
@@ -36,7 +38,8 @@ pub struct WalConfig {
impl Default for WalConfig {
fn default() -> Self {
Self {
file_size: ReadableSize::mb(256), // log file size 256MB
dir: None,
file_size: ReadableSize::mb(256), // log file size 256MB
purge_threshold: ReadableSize::gb(4), // purge threshold 4GB
purge_interval: Duration::from_secs(600),
read_batch_size: 128,

View File

@@ -39,17 +39,25 @@ pub trait ErrorExt: StackError {
where
Self: Sized,
{
let error = self.last();
if let Some(external_error) = error.source() {
let external_root = external_error.sources().last().unwrap();
if error.to_string().is_empty() {
format!("{external_root}")
} else {
format!("{error}: {external_root}")
match self.status_code() {
StatusCode::Unknown | StatusCode::Unexpected | StatusCode::Internal => {
// masks internal error from end user
format!("Internal error: {}", self.status_code() as u32)
}
_ => {
let error = self.last();
if let Some(external_error) = error.source() {
let external_root = external_error.sources().last().unwrap();
if error.to_string().is_empty() {
format!("{external_root}")
} else {
format!("{error}: {external_root}")
}
} else {
format!("{error}")
}
}
} else {
format!("{error}")
}
}
}

View File

@@ -58,9 +58,15 @@ impl Function for RangeFunction {
"range_fn"
}
// range_fn will never been used, return_type could be arbitrary value, is not important
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::float64_datatype())
// The first argument to range_fn is the expression to be evaluated
fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
input_types
.first()
.cloned()
.ok_or(DataFusionError::Internal(
"No expr found in range_fn".into(),
))
.context(GeneralDataFusionSnafu)
}
/// `range_fn` will never be used. As long as a legal signature is returned, the specific content of the signature does not matter.

View File

@@ -15,6 +15,8 @@
use std::env;
use std::io::ErrorKind;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use common_runtime::error::{Error, Result};
@@ -24,7 +26,7 @@ use reqwest::{Client, Response};
use serde::{Deserialize, Serialize};
/// The URL to report telemetry data.
pub const TELEMETRY_URL: &str = "https://api.greptime.cloud/db/otel/statistics";
pub const TELEMETRY_URL: &str = "https://telemetry.greptimestats.com/db/otel/statistics";
/// The local installation uuid cache file
const UUID_FILE_NAME: &str = ".greptimedb-telemetry-uuid";
@@ -36,13 +38,29 @@ const GREPTIMEDB_TELEMETRY_CLIENT_CONNECT_TIMEOUT: Duration = Duration::from_sec
const GREPTIMEDB_TELEMETRY_CLIENT_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
pub enum GreptimeDBTelemetryTask {
Enable(RepeatedTask<Error>),
Enable((RepeatedTask<Error>, Arc<AtomicBool>)),
Disable,
}
impl GreptimeDBTelemetryTask {
pub fn enable(interval: Duration, task_fn: BoxedTaskFunction<Error>) -> Self {
GreptimeDBTelemetryTask::Enable(RepeatedTask::new(interval, task_fn))
pub fn should_report(&self, value: bool) {
match self {
GreptimeDBTelemetryTask::Enable((_, should_report)) => {
should_report.store(value, Ordering::Relaxed);
}
GreptimeDBTelemetryTask::Disable => {}
}
}
pub fn enable(
interval: Duration,
task_fn: BoxedTaskFunction<Error>,
should_report: Arc<AtomicBool>,
) -> Self {
GreptimeDBTelemetryTask::Enable((
RepeatedTask::new(interval, task_fn).with_initial_delay(Some(Duration::ZERO)),
should_report,
))
}
pub fn disable() -> Self {
@@ -51,7 +69,7 @@ impl GreptimeDBTelemetryTask {
pub fn start(&self) -> Result<()> {
match self {
GreptimeDBTelemetryTask::Enable(task) => {
GreptimeDBTelemetryTask::Enable((task, _)) => {
print_anonymous_usage_data_disclaimer();
task.start(common_runtime::bg_runtime())
}
@@ -61,7 +79,7 @@ impl GreptimeDBTelemetryTask {
pub async fn stop(&self) -> Result<()> {
match self {
GreptimeDBTelemetryTask::Enable(task) => task.stop().await,
GreptimeDBTelemetryTask::Enable((task, _)) => task.stop().await,
GreptimeDBTelemetryTask::Disable => Ok(()),
}
}
@@ -191,6 +209,7 @@ pub struct GreptimeDBTelemetry {
client: Option<Client>,
working_home: Option<String>,
telemetry_url: &'static str,
should_report: Arc<AtomicBool>,
}
#[async_trait::async_trait]
@@ -200,13 +219,19 @@ impl TaskFunction<Error> for GreptimeDBTelemetry {
}
async fn call(&mut self) -> Result<()> {
self.report_telemetry_info().await;
if self.should_report.load(Ordering::Relaxed) {
self.report_telemetry_info().await;
}
Ok(())
}
}
impl GreptimeDBTelemetry {
pub fn new(working_home: Option<String>, statistics: Box<dyn Collector + Send + Sync>) -> Self {
pub fn new(
working_home: Option<String>,
statistics: Box<dyn Collector + Send + Sync>,
should_report: Arc<AtomicBool>,
) -> Self {
let client = Client::builder()
.connect_timeout(GREPTIMEDB_TELEMETRY_CLIENT_CONNECT_TIMEOUT)
.timeout(GREPTIMEDB_TELEMETRY_CLIENT_REQUEST_TIMEOUT)
@@ -216,6 +241,7 @@ impl GreptimeDBTelemetry {
statistics,
client: client.ok(),
telemetry_url: TELEMETRY_URL,
should_report,
}
}
@@ -250,7 +276,8 @@ impl GreptimeDBTelemetry {
mod tests {
use std::convert::Infallible;
use std::env;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::{AtomicBool, AtomicUsize};
use std::sync::Arc;
use std::time::Duration;
use common_test_util::ports;
@@ -370,7 +397,11 @@ mod tests {
let working_home = working_home_temp.path().to_str().unwrap().to_string();
let test_statistic = Box::new(TestStatistic);
let mut test_report = GreptimeDBTelemetry::new(Some(working_home.clone()), test_statistic);
let mut test_report = GreptimeDBTelemetry::new(
Some(working_home.clone()),
test_statistic,
Arc::new(AtomicBool::new(true)),
);
let url = Box::leak(format!("{}:{}", "http://localhost", port).into_boxed_str());
test_report.telemetry_url = url;
let response = test_report.report_telemetry_info().await.unwrap();
@@ -384,7 +415,11 @@ mod tests {
assert_eq!(1, body.nodes.unwrap());
let failed_statistic = Box::new(FailedStatistic);
let mut failed_report = GreptimeDBTelemetry::new(Some(working_home), failed_statistic);
let mut failed_report = GreptimeDBTelemetry::new(
Some(working_home),
failed_statistic,
Arc::new(AtomicBool::new(true)),
);
failed_report.telemetry_url = url;
let response = failed_report.report_telemetry_info().await;
assert!(response.is_none());

View File

@@ -258,7 +258,6 @@ impl<T: Serialize + DeserializeOwned> DeserializedValueWithBytes<T> {
self.bytes.to_vec()
}
#[cfg(feature = "testing")]
/// Notes: used for test purpose.
pub fn from_inner(inner: T) -> Self {
let bytes = serde_json::to_vec(&inner).unwrap();

View File

@@ -13,6 +13,7 @@
// limitations under the License.
pub mod memory;
pub mod test;
pub mod txn;
use std::any::Any;

View File

@@ -17,7 +17,6 @@ use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::fmt::{Display, Formatter};
use std::marker::PhantomData;
use std::ops::Range;
use std::sync::RwLock;
use async_trait::async_trait;
@@ -85,21 +84,25 @@ impl<T: ErrorExt + Send + Sync + 'static> KvBackend for MemoryKvBackend<T> {
}
async fn range(&self, req: RangeRequest) -> Result<RangeResponse, Self::Error> {
let range = req.range();
let RangeRequest {
key,
range_end,
limit,
keys_only,
limit, keys_only, ..
} = req;
let kvs = self.kvs.read().unwrap();
let values = kvs.range(range);
let iter: Box<dyn Iterator<Item = (&Vec<u8>, &Vec<u8>)>> = if range_end.is_empty() {
Box::new(kvs.get_key_value(&key).into_iter())
} else {
Box::new(kvs.range(key..range_end))
};
let mut kvs = iter
let mut more = false;
let mut iter: i64 = 0;
let kvs = values
.take_while(|_| {
let take = limit == 0 || iter != limit;
iter += 1;
more = limit > 0 && iter > limit;
take
})
.map(|(k, v)| {
let key = k.clone();
let value = if keys_only { vec![] } else { v.clone() };
@@ -107,13 +110,6 @@ impl<T: ErrorExt + Send + Sync + 'static> KvBackend for MemoryKvBackend<T> {
})
.collect::<Vec<_>>();
let more = if limit > 0 && kvs.len() > limit as usize {
kvs.truncate(limit as usize);
true
} else {
false
};
Ok(RangeResponse { kvs, more })
}
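For illustration (editorial, not part of the diff): with the store seeded as in the shared test helpers (`key1`, `key11`, `key2`, `key3`), the new `limit`/`more` bookkeeping behaves as this hypothetical request suggests:

let resp = kv_store
    .range(RangeRequest {
        key: b"key1".to_vec(),
        range_end: util::get_prefix_end_key(b"key1"),
        limit: 1,
        keys_only: false,
    })
    .await
    .unwrap();
assert_eq!(1, resp.kvs.len()); // capped by the limit
assert!(resp.more); // a second match ("key11") exists beyond the limit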
@@ -215,36 +211,32 @@ impl<T: ErrorExt + Send + Sync + 'static> KvBackend for MemoryKvBackend<T> {
&self,
req: DeleteRangeRequest,
) -> Result<DeleteRangeResponse, Self::Error> {
let DeleteRangeRequest {
key,
range_end,
prev_kv,
} = req;
let range = req.range();
let DeleteRangeRequest { prev_kv, .. } = req;
let mut kvs = self.kvs.write().unwrap();
let prev_kvs = if range_end.is_empty() {
kvs.remove(&key)
.into_iter()
.map(|value| KeyValue {
key: key.clone(),
value,
})
.collect::<Vec<_>>()
} else {
let range = Range {
start: key,
end: range_end,
};
kvs.extract_if(|key, _| range.contains(key))
.map(Into::into)
.collect::<Vec<_>>()
};
let keys = kvs
.range(range)
.map(|(key, _)| key.clone())
.collect::<Vec<_>>();
Ok(DeleteRangeResponse {
deleted: prev_kvs.len() as i64,
prev_kvs: if prev_kv { prev_kvs } else { vec![] },
})
let mut prev_kvs = if prev_kv {
Vec::with_capacity(keys.len())
} else {
vec![]
};
let deleted = keys.len() as i64;
for key in keys {
if let Some(value) = kvs.remove(&key) {
if prev_kv {
prev_kvs.push((key.clone(), value).into())
}
}
}
Ok(DeleteRangeResponse { deleted, prev_kvs })
}
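Likewise, a hypothetical delete over the same seeded data shows the rewritten range-based deletion and the `prev_kv` option:

let resp = kv_store
    .delete_range(DeleteRangeRequest {
        key: b"key1".to_vec(),
        range_end: util::get_prefix_end_key(b"key1"),
        prev_kv: true,
    })
    .await
    .unwrap();
assert_eq!(2, resp.deleted); // "key1" and "key11" removed
assert_eq!(2, resp.prev_kvs.len()); // previous kvs returned because prev_kv was set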
async fn batch_delete(
@@ -358,254 +350,63 @@ impl<T: ErrorExt + Send + Sync> TxnService for MemoryKvBackend<T> {
#[cfg(test)]
mod tests {
use std::sync::atomic::{AtomicU8, Ordering};
use std::sync::Arc;
use super::*;
use crate::error::Error;
use crate::kv_backend::test::{
prepare_kv, test_kv_batch_delete, test_kv_batch_get, test_kv_compare_and_put,
test_kv_delete_range, test_kv_put, test_kv_range, test_kv_range_2,
};
use crate::kv_backend::KvBackend;
use crate::rpc::store::{BatchGetRequest, BatchPutRequest};
use crate::rpc::KeyValue;
use crate::util;
async fn mock_mem_store_with_data() -> MemoryKvBackend<Error> {
let kv_store = MemoryKvBackend::<Error>::new();
let kvs = mock_kvs();
assert!(kv_store
.batch_put(BatchPutRequest {
kvs,
..Default::default()
})
.await
.is_ok());
assert!(kv_store
.put(PutRequest {
key: b"key11".to_vec(),
value: b"val11".to_vec(),
..Default::default()
})
.await
.is_ok());
prepare_kv(&kv_store).await;
kv_store
}
fn mock_kvs() -> Vec<KeyValue> {
vec![
KeyValue {
key: b"key1".to_vec(),
value: b"val1".to_vec(),
},
KeyValue {
key: b"key2".to_vec(),
value: b"val2".to_vec(),
},
KeyValue {
key: b"key3".to_vec(),
value: b"val3".to_vec(),
},
]
}
#[tokio::test]
async fn test_put() {
let kv_store = mock_mem_store_with_data().await;
let resp = kv_store
.put(PutRequest {
key: b"key11".to_vec(),
value: b"val12".to_vec(),
prev_kv: false,
})
.await
.unwrap();
assert!(resp.prev_kv.is_none());
let resp = kv_store
.put(PutRequest {
key: b"key11".to_vec(),
value: b"val13".to_vec(),
prev_kv: true,
})
.await
.unwrap();
let prev_kv = resp.prev_kv.unwrap();
assert_eq!(b"key11", prev_kv.key());
assert_eq!(b"val12", prev_kv.value());
test_kv_put(kv_store).await;
}
#[tokio::test]
async fn test_range() {
let kv_store = mock_mem_store_with_data().await;
let key = b"key1".to_vec();
let range_end = util::get_prefix_end_key(b"key1");
test_kv_range(kv_store).await;
}
let resp = kv_store
.range(RangeRequest {
key: key.clone(),
range_end: range_end.clone(),
limit: 0,
keys_only: false,
})
.await
.unwrap();
#[tokio::test]
async fn test_range_2() {
let kv = MemoryKvBackend::<Error>::new();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
assert_eq!(b"key11", resp.kvs[1].key());
assert_eq!(b"val11", resp.kvs[1].value());
let resp = kv_store
.range(RangeRequest {
key: key.clone(),
range_end: range_end.clone(),
limit: 0,
keys_only: true,
})
.await
.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"", resp.kvs[0].value());
assert_eq!(b"key11", resp.kvs[1].key());
assert_eq!(b"", resp.kvs[1].value());
let resp = kv_store
.range(RangeRequest {
key: key.clone(),
limit: 0,
keys_only: false,
..Default::default()
})
.await
.unwrap();
assert_eq!(1, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
let resp = kv_store
.range(RangeRequest {
key,
range_end,
limit: 1,
keys_only: false,
})
.await
.unwrap();
assert_eq!(1, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
test_kv_range_2(kv).await;
}
#[tokio::test]
async fn test_batch_get() {
let kv_store = mock_mem_store_with_data().await;
let keys = vec![];
let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap();
assert!(resp.kvs.is_empty());
let keys = vec![b"key10".to_vec()];
let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap();
assert!(resp.kvs.is_empty());
let keys = vec![b"key1".to_vec(), b"key3".to_vec(), b"key4".to_vec()];
let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
assert_eq!(b"key3", resp.kvs[1].key());
assert_eq!(b"val3", resp.kvs[1].value());
test_kv_batch_get(kv_store).await;
}
#[tokio::test(flavor = "multi_thread")]
async fn test_compare_and_put() {
let kv_store = Arc::new(MemoryKvBackend::<Error>::new());
let success = Arc::new(AtomicU8::new(0));
let mut joins = vec![];
for _ in 0..20 {
let kv_store_clone = kv_store.clone();
let success_clone = success.clone();
let join = tokio::spawn(async move {
let req = CompareAndPutRequest {
key: b"key".to_vec(),
expect: vec![],
value: b"val_new".to_vec(),
};
let resp = kv_store_clone.compare_and_put(req).await.unwrap();
if resp.success {
success_clone.fetch_add(1, Ordering::SeqCst);
}
});
joins.push(join);
}
for join in joins {
join.await.unwrap();
}
assert_eq!(1, success.load(Ordering::SeqCst));
test_kv_compare_and_put(kv_store).await;
}
#[tokio::test]
async fn test_delete_range() {
let kv_store = mock_mem_store_with_data().await;
let req = DeleteRangeRequest {
key: b"key3".to_vec(),
range_end: vec![],
prev_kv: true,
};
let resp = kv_store.delete_range(req).await.unwrap();
assert_eq!(1, resp.prev_kvs.len());
assert_eq!(b"key3", resp.prev_kvs[0].key());
assert_eq!(b"val3", resp.prev_kvs[0].value());
let resp = kv_store.get(b"key3").await.unwrap();
assert!(resp.is_none());
let req = DeleteRangeRequest {
key: b"key2".to_vec(),
range_end: vec![],
prev_kv: false,
};
let resp = kv_store.delete_range(req).await.unwrap();
assert!(resp.prev_kvs.is_empty());
let resp = kv_store.get(b"key2").await.unwrap();
assert!(resp.is_none());
let key = b"key1".to_vec();
let range_end = util::get_prefix_end_key(b"key1");
let req = DeleteRangeRequest {
key: key.clone(),
range_end: range_end.clone(),
prev_kv: true,
};
let resp = kv_store.delete_range(req).await.unwrap();
assert_eq!(2, resp.prev_kvs.len());
let req = RangeRequest {
key,
range_end,
..Default::default()
};
let resp = kv_store.range(req).await.unwrap();
assert!(resp.kvs.is_empty());
test_kv_delete_range(kv_store).await;
}
#[tokio::test]
@@ -636,35 +437,6 @@ mod tests {
async fn test_batch_delete() {
let kv_store = mock_mem_store_with_data().await;
assert!(kv_store.get(b"key1").await.unwrap().is_some());
assert!(kv_store.get(b"key100").await.unwrap().is_none());
let req = BatchDeleteRequest {
keys: vec![b"key1".to_vec(), b"key100".to_vec()],
prev_kv: true,
};
let resp = kv_store.batch_delete(req).await.unwrap();
assert_eq!(1, resp.prev_kvs.len());
assert_eq!(
vec![KeyValue {
key: b"key1".to_vec(),
value: b"val1".to_vec()
}],
resp.prev_kvs
);
assert!(kv_store.get(b"key1").await.unwrap().is_none());
assert!(kv_store.get(b"key2").await.unwrap().is_some());
assert!(kv_store.get(b"key3").await.unwrap().is_some());
let req = BatchDeleteRequest {
keys: vec![b"key2".to_vec(), b"key3".to_vec()],
prev_kv: false,
};
let resp = kv_store.batch_delete(req).await.unwrap();
assert!(resp.prev_kvs.is_empty());
assert!(kv_store.get(b"key2").await.unwrap().is_none());
assert!(kv_store.get(b"key3").await.unwrap().is_none());
test_kv_batch_delete(kv_store).await;
}
}

View File

@@ -0,0 +1,352 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::atomic::{AtomicU8, Ordering};
use std::sync::Arc;
use super::{KvBackend, *};
use crate::error::Error;
use crate::rpc::store::{BatchGetRequest, PutRequest};
use crate::rpc::KeyValue;
use crate::util;
pub fn mock_kvs() -> Vec<KeyValue> {
vec![
KeyValue {
key: b"key1".to_vec(),
value: b"val1".to_vec(),
},
KeyValue {
key: b"key2".to_vec(),
value: b"val2".to_vec(),
},
KeyValue {
key: b"key3".to_vec(),
value: b"val3".to_vec(),
},
]
}
pub async fn prepare_kv(kv_store: &impl KvBackend) {
let kvs = mock_kvs();
assert!(kv_store
.batch_put(BatchPutRequest {
kvs,
..Default::default()
})
.await
.is_ok());
assert!(kv_store
.put(PutRequest {
key: b"key11".to_vec(),
value: b"val11".to_vec(),
..Default::default()
})
.await
.is_ok());
}
pub async fn test_kv_put(kv_store: impl KvBackend) {
let resp = kv_store
.put(PutRequest {
key: b"key11".to_vec(),
value: b"val12".to_vec(),
prev_kv: false,
})
.await
.unwrap();
assert!(resp.prev_kv.is_none());
let resp = kv_store
.put(PutRequest {
key: b"key11".to_vec(),
value: b"val13".to_vec(),
prev_kv: true,
})
.await
.unwrap();
let prev_kv = resp.prev_kv.unwrap();
assert_eq!(b"key11", prev_kv.key());
assert_eq!(b"val12", prev_kv.value());
}
pub async fn test_kv_range(kv_store: impl KvBackend) {
let key = b"key1".to_vec();
let range_end = util::get_prefix_end_key(b"key1");
let resp = kv_store
.range(RangeRequest {
key: key.clone(),
range_end: range_end.clone(),
limit: 0,
keys_only: false,
})
.await
.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
assert_eq!(b"key11", resp.kvs[1].key());
assert_eq!(b"val11", resp.kvs[1].value());
let resp = kv_store
.range(RangeRequest {
key: key.clone(),
range_end: range_end.clone(),
limit: 0,
keys_only: true,
})
.await
.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"", resp.kvs[0].value());
assert_eq!(b"key11", resp.kvs[1].key());
assert_eq!(b"", resp.kvs[1].value());
let resp = kv_store
.range(RangeRequest {
key: key.clone(),
limit: 0,
keys_only: false,
..Default::default()
})
.await
.unwrap();
assert_eq!(1, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
let resp = kv_store
.range(RangeRequest {
key,
range_end,
limit: 1,
keys_only: false,
})
.await
.unwrap();
assert_eq!(1, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
}
pub async fn test_kv_range_2(kv_store: impl KvBackend) {
kv_store
.put(PutRequest::new().with_key("atest").with_value("value"))
.await
.unwrap();
kv_store
.put(PutRequest::new().with_key("test").with_value("value"))
.await
.unwrap();
// If both key and range_end are \0, then the range represents all keys.
let result = kv_store
.range(RangeRequest::new().with_range(b"\0".to_vec(), b"\0".to_vec()))
.await
.unwrap();
assert_eq!(result.kvs.len(), 2);
assert!(!result.more);
// If range_end is \0, the range is all keys greater than or equal to the key argument.
let result = kv_store
.range(RangeRequest::new().with_range(b"a".to_vec(), b"\0".to_vec()))
.await
.unwrap();
assert_eq!(result.kvs.len(), 2);
let result = kv_store
.range(RangeRequest::new().with_range(b"b".to_vec(), b"\0".to_vec()))
.await
.unwrap();
assert_eq!(result.kvs.len(), 1);
assert_eq!(result.kvs[0].key, b"test");
// Fetches the keys >= "a" with limit 1; `more` should be true.
let result = kv_store
.range(
RangeRequest::new()
.with_range(b"a".to_vec(), b"\0".to_vec())
.with_limit(1),
)
.await
.unwrap();
assert_eq!(result.kvs.len(), 1);
assert!(result.more);
// Fetches the keys >= "a" with limit 2; `more` should be false.
let result = kv_store
.range(
RangeRequest::new()
.with_range(b"a".to_vec(), b"\0".to_vec())
.with_limit(2),
)
.await
.unwrap();
assert_eq!(result.kvs.len(), 2);
assert!(!result.more);
// Fetches the keys >= "a" with limit 3; `more` should be false.
let result = kv_store
.range(
RangeRequest::new()
.with_range(b"a".to_vec(), b"\0".to_vec())
.with_limit(3),
)
.await
.unwrap();
assert_eq!(result.kvs.len(), 2);
assert!(!result.more);
}
pub async fn test_kv_batch_get(kv_store: impl KvBackend) {
let keys = vec![];
let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap();
assert!(resp.kvs.is_empty());
let keys = vec![b"key10".to_vec()];
let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap();
assert!(resp.kvs.is_empty());
let keys = vec![b"key1".to_vec(), b"key3".to_vec(), b"key4".to_vec()];
let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap();
assert_eq!(2, resp.kvs.len());
assert_eq!(b"key1", resp.kvs[0].key());
assert_eq!(b"val1", resp.kvs[0].value());
assert_eq!(b"key3", resp.kvs[1].key());
assert_eq!(b"val3", resp.kvs[1].value());
}
pub async fn test_kv_compare_and_put(kv_store: Arc<dyn KvBackend<Error = Error>>) {
let success = Arc::new(AtomicU8::new(0));
let mut joins = vec![];
for _ in 0..20 {
let kv_store_clone = kv_store.clone();
let success_clone = success.clone();
let join = tokio::spawn(async move {
let req = CompareAndPutRequest {
key: b"key".to_vec(),
expect: vec![],
value: b"val_new".to_vec(),
};
let resp = kv_store_clone.compare_and_put(req).await.unwrap();
if resp.success {
success_clone.fetch_add(1, Ordering::SeqCst);
}
});
joins.push(join);
}
for join in joins {
join.await.unwrap();
}
assert_eq!(1, success.load(Ordering::SeqCst));
}
pub async fn test_kv_delete_range(kv_store: impl KvBackend) {
let req = DeleteRangeRequest {
key: b"key3".to_vec(),
range_end: vec![],
prev_kv: true,
};
let resp = kv_store.delete_range(req).await.unwrap();
assert_eq!(1, resp.prev_kvs.len());
assert_eq!(1, resp.deleted);
assert_eq!(b"key3", resp.prev_kvs[0].key());
assert_eq!(b"val3", resp.prev_kvs[0].value());
let resp = kv_store.get(b"key3").await.unwrap();
assert!(resp.is_none());
let req = DeleteRangeRequest {
key: b"key2".to_vec(),
range_end: vec![],
prev_kv: false,
};
let resp = kv_store.delete_range(req).await.unwrap();
assert_eq!(1, resp.deleted);
assert!(resp.prev_kvs.is_empty());
let resp = kv_store.get(b"key2").await.unwrap();
assert!(resp.is_none());
let key = b"key1".to_vec();
let range_end = util::get_prefix_end_key(b"key1");
let req = DeleteRangeRequest {
key: key.clone(),
range_end: range_end.clone(),
prev_kv: true,
};
let resp = kv_store.delete_range(req).await.unwrap();
assert_eq!(2, resp.prev_kvs.len());
let req = RangeRequest {
key,
range_end,
..Default::default()
};
let resp = kv_store.range(req).await.unwrap();
assert!(resp.kvs.is_empty());
}
pub async fn test_kv_batch_delete(kv_store: impl KvBackend) {
assert!(kv_store.get(b"key1").await.unwrap().is_some());
assert!(kv_store.get(b"key100").await.unwrap().is_none());
let req = BatchDeleteRequest {
keys: vec![b"key1".to_vec(), b"key100".to_vec()],
prev_kv: true,
};
let resp = kv_store.batch_delete(req).await.unwrap();
assert_eq!(1, resp.prev_kvs.len());
assert_eq!(
vec![KeyValue {
key: b"key1".to_vec(),
value: b"val1".to_vec()
}],
resp.prev_kvs
);
assert!(kv_store.get(b"key1").await.unwrap().is_none());
assert!(kv_store.get(b"key2").await.unwrap().is_some());
assert!(kv_store.get(b"key3").await.unwrap().is_some());
let req = BatchDeleteRequest {
keys: vec![b"key2".to_vec(), b"key3".to_vec()],
prev_kv: false,
};
let resp = kv_store.batch_delete(req).await.unwrap();
assert!(resp.prev_kvs.is_empty());
assert!(kv_store.get(b"key2").await.unwrap().is_none());
assert!(kv_store.get(b"key3").await.unwrap().is_none());
}

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use std::fmt::{Display, Formatter};
use std::ops::Bound;
use api::v1::meta::{
BatchDeleteRequest as PbBatchDeleteRequest, BatchDeleteResponse as PbBatchDeleteResponse,
@@ -30,6 +31,17 @@ use crate::error;
use crate::error::Result;
use crate::rpc::{util, KeyValue};
pub fn to_range(key: Vec<u8>, range_end: Vec<u8>) -> (Bound<Vec<u8>>, Bound<Vec<u8>>) {
match (&key[..], &range_end[..]) {
(_, []) => (Bound::Included(key.clone()), Bound::Included(key)),
// If both key and range_end are \0, then the range represents all keys.
([0], [0]) => (Bound::Unbounded, Bound::Unbounded),
// If range_end is \0, the range is all keys greater than or equal to the key argument.
(_, [0]) => (Bound::Included(key), Bound::Unbounded),
(_, _) => (Bound::Included(key), Bound::Excluded(range_end)),
}
}
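// Illustrative sketch (not part of this diff): how `to_range` maps a request's
// key/range_end pair onto bounds, following the rules in the match arms above.
// The module and test names below are hypothetical.
#[cfg(test)]
mod to_range_sketch {
    use std::ops::Bound;

    use super::to_range;

    #[test]
    fn maps_key_and_range_end_to_bounds() {
        // Empty range_end: the range contains only `key` itself.
        assert_eq!(
            (Bound::Included(b"a".to_vec()), Bound::Included(b"a".to_vec())),
            to_range(b"a".to_vec(), vec![])
        );
        // Both key and range_end are "\0": the range covers all keys.
        assert_eq!(
            (Bound::Unbounded, Bound::Unbounded),
            to_range(b"\0".to_vec(), b"\0".to_vec())
        );
        // range_end is "\0": all keys greater than or equal to `key`.
        assert_eq!(
            (Bound::Included(b"a".to_vec()), Bound::Unbounded),
            to_range(b"a".to_vec(), b"\0".to_vec())
        );
        // Otherwise: the half-open range [key, range_end).
        assert_eq!(
            (Bound::Included(b"a".to_vec()), Bound::Excluded(b"b".to_vec())),
            to_range(b"a".to_vec(), b"b".to_vec())
        );
    }
}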
#[derive(Debug, Clone, Default)]
pub struct RangeRequest {
/// key is the first key for the range. If range_end is not given, the
@@ -96,6 +108,11 @@ impl RangeRequest {
}
}
/// Returns the `RangeBounds`.
pub fn range(&self) -> (Bound<Vec<u8>>, Bound<Vec<u8>>) {
to_range(self.key.clone(), self.range_end.clone())
}
/// key is the first key for the range. If range_end is not given, the
/// request only looks up key.
#[inline]
@@ -690,6 +707,11 @@ impl DeleteRangeRequest {
}
}
/// Returns the `RangeBounds`.
pub fn range(&self) -> (Bound<Vec<u8>>, Bound<Vec<u8>>) {
to_range(self.key.clone(), self.range_end.clone())
}
/// key is the first key to delete in the range. If range_end is not given,
/// the range is defined to contain only the key argument.
#[inline]

View File

@@ -34,6 +34,9 @@ pub enum Error {
#[snafu(display("Loader {} is already registered", name))]
LoaderConflict { name: String, location: Location },
#[snafu(display("Procedure Manager is stopped"))]
ManagerNotStart { location: Location },
#[snafu(display("Failed to serialize to json"))]
ToJson {
#[snafu(source)]
@@ -148,7 +151,8 @@ impl ErrorExt for Error {
| Error::FromJson { .. }
| Error::RetryTimesExceeded { .. }
| Error::RetryLater { .. }
| Error::WaitWatcher { .. } => StatusCode::Internal,
| Error::WaitWatcher { .. }
| Error::ManagerNotStart { .. } => StatusCode::Internal,
Error::LoaderConflict { .. } | Error::DuplicateProcedure { .. } => {
StatusCode::InvalidArguments
}

View File

@@ -14,6 +14,8 @@
//! Common traits and structures for the procedure framework.
#![feature(assert_matches)]
pub mod error;
pub mod local;
pub mod options;

View File

@@ -16,20 +16,21 @@ mod lock;
mod runner;
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, RwLock};
use std::time::{Duration, Instant};
use async_trait::async_trait;
use backon::ExponentialBuilder;
use common_runtime::{RepeatedTask, TaskFunction};
use common_telemetry::logging;
use common_telemetry::{info, logging};
use snafu::{ensure, ResultExt};
use tokio::sync::watch::{self, Receiver, Sender};
use tokio::sync::Notify;
use tokio::sync::{Mutex as TokioMutex, Notify};
use crate::error::{
DuplicateProcedureSnafu, Error, LoaderConflictSnafu, Result, StartRemoveOutdatedMetaTaskSnafu,
StopRemoveOutdatedMetaTaskSnafu,
DuplicateProcedureSnafu, Error, LoaderConflictSnafu, ManagerNotStartSnafu, Result,
StartRemoveOutdatedMetaTaskSnafu, StopRemoveOutdatedMetaTaskSnafu,
};
use crate::local::lock::LockMap;
use crate::local::runner::Runner;
@@ -135,6 +136,8 @@ pub(crate) struct ManagerContext {
messages: Mutex<HashMap<ProcedureId, ProcedureMessage>>,
/// Ids and finished time of finished procedures.
finished_procedures: Mutex<VecDeque<(ProcedureId, Instant)>>,
/// Running flag.
running: Arc<AtomicBool>,
}
#[async_trait]
@@ -153,9 +156,29 @@ impl ManagerContext {
procedures: RwLock::new(HashMap::new()),
messages: Mutex::new(HashMap::new()),
finished_procedures: Mutex::new(VecDeque::new()),
running: Arc::new(AtomicBool::new(false)),
}
}
#[cfg(test)]
pub(crate) fn set_running(&self) {
self.running.store(true, Ordering::Relaxed);
}
/// Set the running flag.
pub(crate) fn start(&self) {
self.running.store(true, Ordering::Relaxed);
}
pub(crate) fn stop(&self) {
self.running.store(false, Ordering::Relaxed);
}
/// Returns whether the `ProcedureManager` is running.
pub(crate) fn running(&self) -> bool {
self.running.load(Ordering::Relaxed)
}
/// Returns true if the procedure with specific `procedure_id` exists.
fn contains_procedure(&self, procedure_id: ProcedureId) -> bool {
let procedures = self.procedures.read().unwrap();
@@ -368,29 +391,37 @@ pub struct LocalManager {
procedure_store: Arc<ProcedureStore>,
max_retry_times: usize,
retry_delay: Duration,
remove_outdated_meta_task: RepeatedTask<Error>,
/// GC task.
remove_outdated_meta_task: TokioMutex<Option<RepeatedTask<Error>>>,
config: ManagerConfig,
}
impl LocalManager {
/// Create a new [LocalManager] with specific `config`.
pub fn new(config: ManagerConfig, state_store: StateStoreRef) -> LocalManager {
let manager_ctx = Arc::new(ManagerContext::new());
let remove_outdated_meta_task = RepeatedTask::new(
config.remove_outdated_meta_task_interval,
Box::new(RemoveOutdatedMetaFunction {
manager_ctx: manager_ctx.clone(),
ttl: config.remove_outdated_meta_ttl,
}),
);
LocalManager {
manager_ctx,
procedure_store: Arc::new(ProcedureStore::new(&config.parent_path, state_store)),
max_retry_times: config.max_retry_times,
retry_delay: config.retry_delay,
remove_outdated_meta_task,
remove_outdated_meta_task: TokioMutex::new(None),
config,
}
}
/// Builds the remove-outdated-meta task.
pub fn build_remove_outdated_meta_task(&self) -> RepeatedTask<Error> {
RepeatedTask::new(
self.config.remove_outdated_meta_task_interval,
Box::new(RemoveOutdatedMetaFunction {
manager_ctx: self.manager_ctx.clone(),
ttl: self.config.remove_outdated_meta_ttl,
}),
)
}
/// Submit a root procedure with given `procedure_id`.
fn submit_root(
&self,
@@ -398,6 +429,8 @@ impl LocalManager {
step: u32,
procedure: BoxedProcedure,
) -> Result<Watcher> {
ensure!(self.manager_ctx.running(), ManagerNotStartSnafu);
let meta = Arc::new(ProcedureMeta::new(procedure_id, None, procedure.lock_key()));
let runner = Runner {
meta: meta.clone(),
@@ -426,44 +459,8 @@ impl LocalManager {
Ok(watcher)
}
}
#[async_trait]
impl ProcedureManager for LocalManager {
fn register_loader(&self, name: &str, loader: BoxedProcedureLoader) -> Result<()> {
let mut loaders = self.manager_ctx.loaders.lock().unwrap();
ensure!(!loaders.contains_key(name), LoaderConflictSnafu { name });
let _ = loaders.insert(name.to_string(), loader);
Ok(())
}
fn start(&self) -> Result<()> {
self.remove_outdated_meta_task
.start(common_runtime::bg_runtime())
.context(StartRemoveOutdatedMetaTaskSnafu)?;
Ok(())
}
async fn stop(&self) -> Result<()> {
self.remove_outdated_meta_task
.stop()
.await
.context(StopRemoveOutdatedMetaTaskSnafu)?;
Ok(())
}
async fn submit(&self, procedure: ProcedureWithId) -> Result<Watcher> {
let procedure_id = procedure.id;
ensure!(
!self.manager_ctx.contains_procedure(procedure_id),
DuplicateProcedureSnafu { procedure_id }
);
self.submit_root(procedure.id, 0, procedure.procedure)
}
/// Recovers unfinished procedures and reruns them.
async fn recover(&self) -> Result<()> {
logging::info!("LocalManager start to recover");
let recover_start = Instant::now();
@@ -519,6 +516,64 @@ impl ProcedureManager for LocalManager {
Ok(())
}
}
#[async_trait]
impl ProcedureManager for LocalManager {
fn register_loader(&self, name: &str, loader: BoxedProcedureLoader) -> Result<()> {
let mut loaders = self.manager_ctx.loaders.lock().unwrap();
ensure!(!loaders.contains_key(name), LoaderConflictSnafu { name });
let _ = loaders.insert(name.to_string(), loader);
Ok(())
}
async fn start(&self) -> Result<()> {
let mut task = self.remove_outdated_meta_task.lock().await;
if task.is_some() {
return Ok(());
}
let task_inner = self.build_remove_outdated_meta_task();
task_inner
.start(common_runtime::bg_runtime())
.context(StartRemoveOutdatedMetaTaskSnafu)?;
*task = Some(task_inner);
self.manager_ctx.start();
info!("LocalManager is start.");
self.recover().await
}
async fn stop(&self) -> Result<()> {
let mut task = self.remove_outdated_meta_task.lock().await;
if let Some(task) = task.take() {
task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)?;
}
self.manager_ctx.stop();
info!("LocalManager is stopped.");
Ok(())
}
async fn submit(&self, procedure: ProcedureWithId) -> Result<Watcher> {
let procedure_id = procedure.id;
ensure!(
!self.manager_ctx.contains_procedure(procedure_id),
DuplicateProcedureSnafu { procedure_id }
);
self.submit_root(procedure.id, 0, procedure.procedure)
}
async fn procedure_state(&self, procedure_id: ProcedureId) -> Result<Option<ProcedureState>> {
Ok(self.manager_ctx.state(procedure_id))
@@ -569,12 +624,14 @@ pub(crate) mod test_util {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use common_error::mock::MockError;
use common_error::status_code::StatusCode;
use common_test_util::temp_dir::create_temp_dir;
use super::*;
use crate::error::Error;
use crate::error::{self, Error};
use crate::store::state_store::ObjectStateStore;
use crate::{Context, Procedure, Status};
@@ -691,6 +748,7 @@ mod tests {
};
let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
let manager = LocalManager::new(config, state_store);
manager.manager_ctx.start();
manager
.register_loader("ProcedureToLoad", ProcedureToLoad::loader())
@@ -714,6 +772,7 @@ mod tests {
};
let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
let manager = LocalManager::new(config, state_store);
manager.manager_ctx.start();
manager
.register_loader("ProcedureToLoad", ProcedureToLoad::loader())
@@ -762,6 +821,7 @@ mod tests {
};
let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
let manager = LocalManager::new(config, state_store);
manager.manager_ctx.start();
let procedure_id = ProcedureId::random();
assert!(manager
@@ -812,6 +872,7 @@ mod tests {
};
let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
let manager = LocalManager::new(config, state_store);
manager.manager_ctx.start();
#[derive(Debug)]
struct MockProcedure {
@@ -864,6 +925,66 @@ mod tests {
}
#[tokio::test]
async fn test_procedure_manager_stopped() {
let dir = create_temp_dir("procedure_manager_stopped");
let config = ManagerConfig {
parent_path: "data/".to_string(),
max_retry_times: 3,
retry_delay: Duration::from_millis(500),
..Default::default()
};
let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
let manager = LocalManager::new(config, state_store);
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
let procedure_id = ProcedureId::random();
assert_matches!(
manager
.submit(ProcedureWithId {
id: procedure_id,
procedure: Box::new(procedure),
})
.await
.unwrap_err(),
error::Error::ManagerNotStart { .. }
);
}
#[tokio::test]
async fn test_procedure_manager_restart() {
let dir = create_temp_dir("procedure_manager_restart");
let config = ManagerConfig {
parent_path: "data/".to_string(),
max_retry_times: 3,
retry_delay: Duration::from_millis(500),
..Default::default()
};
let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
let manager = LocalManager::new(config, state_store);
manager.start().await.unwrap();
manager.stop().await.unwrap();
manager.start().await.unwrap();
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
let procedure_id = ProcedureId::random();
assert!(manager
.submit(ProcedureWithId {
id: procedure_id,
procedure: Box::new(procedure),
})
.await
.is_ok());
assert!(manager
.procedure_state(procedure_id)
.await
.unwrap()
.is_some());
}
#[tokio::test(flavor = "multi_thread")]
async fn test_remove_outdated_meta_task() {
let dir = create_temp_dir("remove_outdated_meta_task");
let object_store = test_util::new_object_store(&dir);
@@ -876,6 +997,7 @@ mod tests {
};
let state_store = Arc::new(ObjectStateStore::new(object_store.clone()));
let manager = LocalManager::new(config, state_store);
manager.manager_ctx.set_running();
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
@@ -889,8 +1011,9 @@ mod tests {
.is_ok());
let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
watcher.changed().await.unwrap();
manager.start().unwrap();
tokio::time::sleep(Duration::from_millis(10)).await;
manager.start().await.unwrap();
tokio::time::sleep(Duration::from_millis(300)).await;
assert!(manager
.procedure_state(procedure_id)
.await
@@ -902,6 +1025,8 @@ mod tests {
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
let procedure_id = ProcedureId::random();
manager.manager_ctx.set_running();
assert!(manager
.submit(ProcedureWithId {
id: procedure_id,
@@ -911,11 +1036,33 @@ mod tests {
.is_ok());
let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
watcher.changed().await.unwrap();
tokio::time::sleep(Duration::from_millis(10)).await;
tokio::time::sleep(Duration::from_millis(300)).await;
assert!(manager
.procedure_state(procedure_id)
.await
.unwrap()
.is_some());
// After restart
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
let procedure_id = ProcedureId::random();
assert!(manager
.submit(ProcedureWithId {
id: procedure_id,
procedure: Box::new(procedure),
})
.await
.is_ok());
let mut watcher = manager.procedure_watcher(procedure_id).unwrap();
watcher.changed().await.unwrap();
manager.start().await.unwrap();
tokio::time::sleep(Duration::from_millis(300)).await;
assert!(manager
.procedure_state(procedure_id)
.await
.unwrap()
.is_none());
}
}

View File

@@ -19,7 +19,7 @@ use backon::{BackoffBuilder, ExponentialBuilder};
use common_telemetry::logging;
use tokio::time;
use crate::error::{ProcedurePanicSnafu, Result};
use crate::error::{self, ProcedurePanicSnafu, Result};
use crate::local::{ManagerContext, ProcedureMeta, ProcedureMetaRef};
use crate::store::ProcedureStore;
use crate::ProcedureState::Retrying;
@@ -102,7 +102,6 @@ impl Drop for ProcedureGuard {
}
}
// TODO(yingwen): Support cancellation.
pub(crate) struct Runner {
pub(crate) meta: ProcedureMetaRef,
pub(crate) procedure: BoxedProcedure,
@@ -114,6 +113,11 @@ pub(crate) struct Runner {
}
impl Runner {
/// Returns whether the `ProcedureManager` is running.
pub(crate) fn running(&self) -> bool {
self.manager_ctx.running()
}
/// Run the procedure.
pub(crate) async fn run(mut self) {
// Ensure we can update the procedure state.
@@ -152,6 +156,12 @@ impl Runner {
let procedure_ids = self.manager_ctx.procedures_in_tree(&self.meta);
// Clean resources.
self.manager_ctx.on_procedures_finish(&procedure_ids);
// If `ProcedureManager` is stopped, it stops the current task immediately without deleting the procedure.
if !self.running() {
return;
}
for id in procedure_ids {
if let Err(e) = self.store.delete_procedure(id).await {
logging::error!(
@@ -186,6 +196,13 @@ impl Runner {
let mut retry = self.exponential_builder.build();
let mut retry_times = 0;
loop {
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::Failed {
error: Arc::new(error::ManagerNotStartSnafu {}.build()),
});
return;
}
match self.execute_once(ctx).await {
ExecResult::Done | ExecResult::Failed => return,
ExecResult::Continue => (),
@@ -238,6 +255,14 @@ impl Runner {
status.need_persist(),
);
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::Failed {
error: Arc::new(error::ManagerNotStartSnafu {}.build()),
});
return ExecResult::Failed;
}
if status.need_persist() {
if let Err(err) = self.persist_procedure().await {
self.meta.set_state(ProcedureState::retrying(Arc::new(err)));
@@ -272,6 +297,14 @@ impl Runner {
e.is_retry_later(),
);
// Don't store state if `ProcedureManager` is stopped.
if !self.running() {
self.meta.set_state(ProcedureState::Failed {
error: Arc::new(error::ManagerNotStartSnafu {}.build()),
});
return ExecResult::Failed;
}
if e.is_retry_later() {
self.meta.set_state(ProcedureState::retrying(Arc::new(e)));
return ExecResult::RetryLater;
@@ -581,6 +614,7 @@ mod tests {
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_continue(), "{res:?}");
@@ -641,6 +675,7 @@ mod tests {
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta, Box::new(suspend), procedure_store);
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_continue(), "{res:?}");
@@ -742,6 +777,7 @@ mod tests {
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store.clone());
let manager_ctx = Arc::new(ManagerContext::new());
manager_ctx.start();
// Manually add this procedure to the manager ctx.
assert!(manager_ctx.try_insert_procedure(meta));
// Replace the manager ctx.
@@ -769,6 +805,70 @@ mod tests {
}
}
#[tokio::test]
async fn test_running_is_stopped() {
let exec_fn = move |_| async move { Ok(Status::Executing { persist: true }) }.boxed();
let normal = ProcedureAdapter {
data: "normal".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
exec_fn,
};
let dir = create_temp_dir("test_running_is_stopped");
let meta = normal.new_meta(ROOT_ID);
let ctx = context_without_provider(meta.id);
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_continue(), "{res:?}");
check_files(
&object_store,
&procedure_store,
ctx.procedure_id,
&["0000000000.step"],
)
.await;
runner.manager_ctx.stop();
let res = runner.execute_once(&ctx).await;
assert!(res.is_failed());
// Shouldn't write any files
check_files(
&object_store,
&procedure_store,
ctx.procedure_id,
&["0000000000.step"],
)
.await;
}
#[tokio::test]
async fn test_running_is_stopped_on_error() {
let exec_fn =
|_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
let normal = ProcedureAdapter {
data: "fail".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
exec_fn,
};
let dir = create_temp_dir("test_running_is_stopped_on_error");
let meta = normal.new_meta(ROOT_ID);
let ctx = context_without_provider(meta.id);
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta, Box::new(normal), procedure_store.clone());
runner.manager_ctx.stop();
let res = runner.execute_once(&ctx).await;
assert!(res.is_failed(), "{res:?}");
// Shouldn't write any files
check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
}
#[tokio::test]
async fn test_execute_on_error() {
let exec_fn =
@@ -785,6 +885,7 @@ mod tests {
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_failed(), "{res:?}");
@@ -826,6 +927,7 @@ mod tests {
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
runner.manager_ctx.start();
let res = runner.execute_once(&ctx).await;
assert!(res.is_retry_later(), "{res:?}");
@@ -863,6 +965,8 @@ mod tests {
Box::new(exceed_max_retry_later),
procedure_store,
);
runner.manager_ctx.start();
runner.exponential_builder = ExponentialBuilder::default()
.with_min_delay(Duration::from_millis(1))
.with_max_times(3);
@@ -933,8 +1037,8 @@ mod tests {
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
let manager_ctx = Arc::new(ManagerContext::new());
manager_ctx.start();
// Manually add this procedure to the manager ctx.
assert!(manager_ctx.try_insert_procedure(meta.clone()));
// Replace the manager ctx.
@@ -943,6 +1047,6 @@ mod tests {
// Run the runner and execute the procedure.
runner.run().await;
let err = meta.state().error().unwrap().output_msg();
assert!(err.contains("subprocedure failed"), "{err}");
assert!(err.contains("Internal error"), "{err}");
}
}

View File

@@ -279,8 +279,14 @@ pub trait ProcedureManager: Send + Sync + 'static {
/// Registers loader for specific procedure type `name`.
fn register_loader(&self, name: &str, loader: BoxedProcedureLoader) -> Result<()>;
fn start(&self) -> Result<()>;
/// Starts the background GC task.
///
/// Recovers unfinished procedures and reruns them.
///
/// Callers should ensure all loaders are registered.
async fn start(&self) -> Result<()>;
/// Stops the background GC task.
async fn stop(&self) -> Result<()>;
/// Submits a procedure to execute.
@@ -288,11 +294,6 @@ pub trait ProcedureManager: Send + Sync + 'static {
/// Returns a [Watcher] to watch the created procedure.
async fn submit(&self, procedure: ProcedureWithId) -> Result<Watcher>;
/// Recovers unfinished procedures and reruns them.
///
/// Callers should ensure all loaders are registered.
async fn recover(&self) -> Result<()>;
/// Query the procedure state.
///
/// Returns `Ok(None)` if the procedure doesn't exist.
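// Illustrative sketch (not part of this diff): the call order implied by the
// updated trait docs above: register loaders, then `start` (which now also
// recovers persisted procedures), then submit. `procedure_with_id` stands in
// for a caller-built `ProcedureWithId` whose loader is already registered.
async fn start_and_submit(
    manager: &dyn ProcedureManager,
    procedure_with_id: ProcedureWithId,
) -> Result<Watcher> {
    manager.start().await?;
    manager.submit(procedure_with_id).await
}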

View File

@@ -71,6 +71,7 @@ mod tests {
};
let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
let manager = LocalManager::new(config, state_store);
manager.start().await.unwrap();
#[derive(Debug)]
struct MockProcedure {

View File

@@ -13,8 +13,10 @@
// limitations under the License.
use std::collections::HashMap;
use std::slice;
use std::sync::Arc;
use datafusion::arrow::util::pretty::pretty_format_batches;
use datatypes::schema::SchemaRef;
use datatypes::value::Value;
use datatypes::vectors::{Helper, VectorRef};
@@ -169,6 +171,13 @@ impl RecordBatch {
Ok(vectors)
}
/// Pretty-prints this record batch like a table.
pub fn pretty_print(&self) -> String {
pretty_format_batches(slice::from_ref(&self.df_record_batch))
.map(|t| t.to_string())
.unwrap_or("failed to pretty display a record batch".to_string())
}
}
impl Serialize for RecordBatch {

View File

@@ -40,6 +40,7 @@ pub type BoxedTaskFunction<E> = Box<dyn TaskFunction<E> + Send + Sync + 'static>
struct TaskInner<E> {
/// The repeated task handle. This handle is Some if the task is started.
task_handle: Option<JoinHandle<()>>,
/// The task_fn to run. This is Some if the task is not started.
task_fn: Option<BoxedTaskFunction<E>>,
}
@@ -50,6 +51,7 @@ pub struct RepeatedTask<E> {
inner: Mutex<TaskInner<E>>,
started: AtomicBool,
interval: Duration,
initial_delay: Option<Duration>,
}
impl<E> std::fmt::Display for RepeatedTask<E> {
@@ -75,6 +77,9 @@ impl<E> Drop for RepeatedTask<E> {
}
impl<E: ErrorExt + 'static> RepeatedTask<E> {
/// Creates a new repeated task. The `initial_delay` is the delay before the first execution.
/// `initial_delay` defaults to `None`, in which case the first execution also waits for `interval`.
/// Use `with_initial_delay` to set the `initial_delay`.
pub fn new(interval: Duration, task_fn: BoxedTaskFunction<E>) -> Self {
Self {
name: task_fn.name().to_string(),
@@ -85,9 +90,15 @@ impl<E: ErrorExt + 'static> RepeatedTask<E> {
}),
started: AtomicBool::new(false),
interval,
initial_delay: None,
}
}
pub fn with_initial_delay(mut self, initial_delay: Option<Duration>) -> Self {
self.initial_delay = initial_delay;
self
}
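// Illustrative sketch (not part of this diff): a task whose first tick runs
// immediately and whose later ticks keep a regular interval, as exercised by
// `test_repeated_task_prior_exec` below. `task_fn` is any boxed `TaskFunction`,
// and the 10-second interval is an arbitrary example value.
fn eager_task<E: ErrorExt + 'static>(task_fn: BoxedTaskFunction<E>) -> RepeatedTask<E> {
    RepeatedTask::new(Duration::from_secs(10), task_fn)
        .with_initial_delay(Some(Duration::ZERO))
}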
pub fn started(&self) -> bool {
self.started.load(Ordering::Relaxed)
}
@@ -99,17 +110,21 @@ impl<E: ErrorExt + 'static> RepeatedTask<E> {
IllegalStateSnafu { name: &self.name }
);
let interval = self.interval;
let child = self.cancel_token.child_token();
// Safety: The task is not started.
let mut task_fn = inner.task_fn.take().unwrap();
let interval = self.interval;
let mut initial_delay = self.initial_delay;
// TODO(hl): Maybe spawn to a blocking runtime.
let handle = runtime.spawn(async move {
loop {
tokio::select! {
_ = tokio::time::sleep(interval) => {}
_ = child.cancelled() => {
return;
let sleep_time = initial_delay.take().unwrap_or(interval);
if sleep_time > Duration::ZERO {
tokio::select! {
_ = tokio::time::sleep(sleep_time) => {}
_ = child.cancelled() => {
return;
}
}
}
if let Err(e) = task_fn.call().await {
@@ -192,4 +207,21 @@ mod tests {
assert_eq!(n.load(Ordering::Relaxed), 5);
}
#[tokio::test]
async fn test_repeated_task_prior_exec() {
common_telemetry::init_default_ut_logging();
let n = Arc::new(AtomicI32::new(0));
let task_fn = TickTask { n: n.clone() };
let task = RepeatedTask::new(Duration::from_millis(100), Box::new(task_fn))
.with_initial_delay(Some(Duration::ZERO));
task.start(crate::bg_runtime()).unwrap();
tokio::time::sleep(Duration::from_millis(550)).await;
task.stop().await.unwrap();
assert_eq!(n.load(Ordering::Relaxed), 6);
}
}

View File

@@ -367,7 +367,10 @@ impl DatanodeBuilder {
/// Build [RaftEngineLogStore]
async fn build_log_store(opts: &DatanodeOptions) -> Result<Arc<RaftEngineLogStore>> {
let data_home = normalize_dir(&opts.storage.data_home);
let wal_dir = format!("{}{WAL_DIR}", data_home);
let wal_dir = match &opts.wal.dir {
Some(dir) => dir.clone(),
None => format!("{}{WAL_DIR}", data_home),
};
let wal_config = opts.wal.clone();
// create WAL directory

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use async_trait::async_trait;
@@ -60,6 +61,8 @@ pub async fn get_greptimedb_telemetry_task(
if !enable || cfg!(test) || cfg!(debug_assertions) {
return Arc::new(GreptimeDBTelemetryTask::disable());
}
// Reporting is always enabled.
let should_report = Arc::new(AtomicBool::new(true));
match mode {
Mode::Standalone => Arc::new(GreptimeDBTelemetryTask::enable(
@@ -70,7 +73,9 @@ pub async fn get_greptimedb_telemetry_task(
uuid: default_get_uuid(&working_home),
retry: 0,
}),
should_report.clone(),
)),
should_report,
)),
Mode::Distributed => Arc::new(GreptimeDBTelemetryTask::disable()),
}

View File

@@ -654,6 +654,17 @@ impl ListValue {
Arc::new(new_item_field(output_type.item_type().as_arrow_type())),
))
}
/// Uses `the first item's size * number of items` to estimate the size.
/// The estimate can be inaccurate.
fn estimated_size(&self) -> usize {
if let Some(items) = &self.items {
if let Some(item) = items.first() {
return item.as_value_ref().data_size() * items.len();
}
}
0
}
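// Illustrative note (not part of this diff): for a list holding the strings
// ["hello world", "greptimedb"], the estimate is 11 (size of the first item)
// * 2 (number of items) = 22 bytes, which is what the new
// `test_value_ref_estimated_size` case below asserts, even though the second
// item is only 10 bytes long.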
}
impl Default for ListValue {
@@ -1090,12 +1101,46 @@ impl<'a> PartialOrd for ListValueRef<'a> {
}
}
impl<'a> ValueRef<'a> {
/// Returns the size of the underlying data in bytes.
/// The size is an estimate and only considers the data itself.
pub fn data_size(&self) -> usize {
match *self {
ValueRef::Null => 0,
ValueRef::Boolean(_) => 1,
ValueRef::UInt8(_) => 1,
ValueRef::UInt16(_) => 2,
ValueRef::UInt32(_) => 4,
ValueRef::UInt64(_) => 8,
ValueRef::Int8(_) => 1,
ValueRef::Int16(_) => 2,
ValueRef::Int32(_) => 4,
ValueRef::Int64(_) => 8,
ValueRef::Float32(_) => 4,
ValueRef::Float64(_) => 8,
ValueRef::String(v) => std::mem::size_of_val(v),
ValueRef::Binary(v) => std::mem::size_of_val(v),
ValueRef::Date(_) => 4,
ValueRef::DateTime(_) => 8,
ValueRef::Timestamp(_) => 16,
ValueRef::Time(_) => 16,
ValueRef::Duration(_) => 16,
ValueRef::Interval(_) => 24,
ValueRef::List(v) => match v {
ListValueRef::Indexed { vector, .. } => vector.memory_size() / vector.len(),
ListValueRef::Ref { val } => val.estimated_size(),
},
}
}
}
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use num_traits::Float;
use super::*;
use crate::vectors::ListVectorBuilder;
#[test]
fn test_try_from_scalar_value() {
@@ -2158,4 +2203,90 @@ mod tests {
duration_to_scalar_value(TimeUnit::Nanosecond, Some(1))
);
}
fn check_value_ref_size_eq(value_ref: &ValueRef, size: usize) {
assert_eq!(value_ref.data_size(), size);
}
#[test]
fn test_value_ref_estimated_size() {
assert_eq!(std::mem::size_of::<ValueRef>(), 24);
check_value_ref_size_eq(&ValueRef::Boolean(true), 1);
check_value_ref_size_eq(&ValueRef::UInt8(1), 1);
check_value_ref_size_eq(&ValueRef::UInt16(1), 2);
check_value_ref_size_eq(&ValueRef::UInt32(1), 4);
check_value_ref_size_eq(&ValueRef::UInt64(1), 8);
check_value_ref_size_eq(&ValueRef::Int8(1), 1);
check_value_ref_size_eq(&ValueRef::Int16(1), 2);
check_value_ref_size_eq(&ValueRef::Int32(1), 4);
check_value_ref_size_eq(&ValueRef::Int64(1), 8);
check_value_ref_size_eq(&ValueRef::Float32(1.0.into()), 4);
check_value_ref_size_eq(&ValueRef::Float64(1.0.into()), 8);
check_value_ref_size_eq(&ValueRef::String("greptimedb"), 10);
check_value_ref_size_eq(&ValueRef::Binary(b"greptimedb"), 10);
check_value_ref_size_eq(&ValueRef::Date(Date::new(1)), 4);
check_value_ref_size_eq(&ValueRef::DateTime(DateTime::new(1)), 8);
check_value_ref_size_eq(&ValueRef::Timestamp(Timestamp::new_millisecond(1)), 16);
check_value_ref_size_eq(&ValueRef::Time(Time::new_millisecond(1)), 16);
check_value_ref_size_eq(
&ValueRef::Interval(Interval::from_month_day_nano(1, 2, 3)),
24,
);
check_value_ref_size_eq(&ValueRef::Duration(Duration::new_millisecond(1)), 16);
check_value_ref_size_eq(
&ValueRef::List(ListValueRef::Ref {
val: &ListValue {
items: Some(Box::new(vec![
Value::String("hello world".into()),
Value::String("greptimedb".into()),
])),
datatype: ConcreteDataType::string_datatype(),
},
}),
22,
);
let data = vec![
Some(vec![Some(1), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let mut builder =
ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8);
for vec_opt in &data {
if let Some(vec) = vec_opt {
let values = vec.iter().map(|v| Value::from(*v)).collect();
let values = Some(Box::new(values));
let list_value = ListValue::new(values, ConcreteDataType::int32_datatype());
builder.push(Some(ListValueRef::Ref { val: &list_value }));
} else {
builder.push(None);
}
}
let vector = builder.finish();
check_value_ref_size_eq(
&ValueRef::List(ListValueRef::Indexed {
vector: &vector,
idx: 0,
}),
85,
);
check_value_ref_size_eq(
&ValueRef::List(ListValueRef::Indexed {
vector: &vector,
idx: 1,
}),
85,
);
check_value_ref_size_eq(
&ValueRef::List(ListValueRef::Indexed {
vector: &vector,
idx: 2,
}),
85,
)
}
}

View File

@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::InsertRequests;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use servers::error as server_error;
use servers::error::AuthSnafu;
use servers::opentsdb::codec::DataPoint;
use servers::opentsdb::data_point_to_grpc_row_insert_requests;
use servers::query_handler::OpentsdbProtocolHandler;
use session::context::QueryContextRef;
use snafu::prelude::*;
@@ -27,23 +27,27 @@ use crate::instance::Instance;
#[async_trait]
impl OpentsdbProtocolHandler for Instance {
async fn exec(&self, data_point: &DataPoint, ctx: QueryContextRef) -> server_error::Result<()> {
async fn exec(
&self,
data_points: Vec<DataPoint>,
ctx: QueryContextRef,
) -> server_error::Result<usize> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::Opentsdb)
.context(AuthSnafu)?;
let requests = InsertRequests {
inserts: vec![data_point.as_grpc_insert()],
};
let _ = self
.handle_inserts(requests, ctx)
let (requests, _) = data_point_to_grpc_row_insert_requests(data_points)?;
let output = self
.handle_row_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.with_context(|_| server_error::ExecuteQuerySnafu {
query: format!("{data_point:?}"),
})?;
Ok(())
.context(servers::error::ExecuteGrpcQuerySnafu)?;
Ok(match output {
common_query::Output::AffectedRows(rows) => rows,
_ => unreachable!(),
})
}
}

View File

@@ -19,14 +19,18 @@ use metrics::counter;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::otlp;
use servers::otlp::plugin::TraceParserRef;
use servers::query_handler::OpenTelemetryProtocolHandler;
use session::context::QueryContextRef;
use snafu::ResultExt;
use crate::instance::Instance;
use crate::metrics::OTLP_METRICS_ROWS;
use crate::metrics::{OTLP_METRICS_ROWS, OTLP_TRACES_ROWS};
#[async_trait]
impl OpenTelemetryProtocolHandler for Instance {
@@ -40,7 +44,7 @@ impl OpenTelemetryProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let (requests, rows) = otlp::to_grpc_insert_requests(request)?;
let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request)?;
let _ = self
.handle_row_inserts(requests, ctx)
.await
@@ -55,4 +59,40 @@ impl OpenTelemetryProtocolHandler for Instance {
};
Ok(resp)
}
async fn traces(
&self,
request: ExportTraceServiceRequest,
ctx: QueryContextRef,
) -> ServerResult<ExportTraceServiceResponse> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let (table_name, spans) = match self.plugins.get::<TraceParserRef>() {
Some(parser) => (parser.table_name(), parser.parse(request)),
None => (
otlp::trace::TRACE_TABLE_NAME.to_string(),
otlp::trace::parse(request),
),
};
let (requests, rows) = otlp::trace::to_grpc_insert_requests(table_name, spans)?;
let _ = self
.handle_row_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
counter!(OTLP_TRACES_ROWS, rows as u64);
let resp = ExportTraceServiceResponse {
// TODO(fys): add support for partial_success in future patch
partial_success: None,
};
Ok(resp)
}
}

View File

@@ -22,3 +22,4 @@ pub(crate) const METRIC_RUN_SCRIPT_ELAPSED: &str = "frontend.run_script_elapsed"
pub const PROM_STORE_REMOTE_WRITE_SAMPLES: &str = "frontend.prometheus.remote_write.samples";
pub const OTLP_METRICS_ROWS: &str = "frontend.otlp.metrics.rows";
pub const OTLP_TRACES_ROWS: &str = "frontend.otlp.traces.rows";

View File

@@ -19,7 +19,7 @@ common-base = { workspace = true }
common-config = { workspace = true }
common-error = { workspace = true }
common-macro = { workspace = true }
common-meta = { workspace = true }
common-meta = { workspace = true, features = ["testing"] }
common-runtime = { workspace = true }
common-telemetry = { workspace = true }
futures-util.workspace = true

View File

@@ -15,6 +15,7 @@
//! [KvBackend] implementation based on [raft_engine::Engine].
use std::any::Any;
use std::ops::Bound::{Excluded, Included, Unbounded};
use std::sync::RwLock;
use common_error::ext::BoxedError;
@@ -28,6 +29,7 @@ use common_meta::rpc::store::{
RangeRequest, RangeResponse,
};
use common_meta::rpc::KeyValue;
use common_meta::util::get_next_prefix_key;
use raft_engine::{Config, Engine, LogBatch};
use snafu::ResultExt;
@@ -137,29 +139,48 @@ impl KvBackend for RaftEngineBackend {
async fn range(&self, req: RangeRequest) -> Result<RangeResponse, Self::Error> {
let mut res = vec![];
let (start, end) = req.range();
let RangeRequest {
keys_only, limit, ..
} = req;
let (start_key, end_key) = match (start, end) {
(Included(start), Included(end)) => (Some(start), Some(get_next_prefix_key(&end))),
(Unbounded, Unbounded) => (None, None),
(Included(start), Excluded(end)) => (Some(start), Some(end)),
(Included(start), Unbounded) => (Some(start), None),
_ => unreachable!(),
};
let mut more = false;
let mut iter = 0;
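// The callback return value controls the scan: entries are collected until
// `limit` is reached (a limit of 0 means unlimited); if one more matching
// entry shows up after that, it is skipped, `more` is set to true, and
// returning `false` stops the scan early.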
self.engine
.read()
.unwrap()
.scan_raw_messages(
SYSTEM_NAMESPACE,
Some(&req.key),
Some(&req.range_end),
start_key.as_deref(),
end_key.as_deref(),
false,
|key, value| {
res.push(KeyValue {
key: key.to_vec(),
value: value.to_vec(),
});
true
let take = limit == 0 || iter != limit;
iter += 1;
more = limit > 0 && iter > limit;
if take {
res.push(KeyValue {
key: key.to_vec(),
value: if keys_only { vec![] } else { value.to_vec() },
});
}
take
},
)
.context(RaftEngineSnafu)
.map_err(BoxedError::new)
.context(meta_error::ExternalSnafu)?;
Ok(RangeResponse {
kvs: res,
more: false,
})
Ok(RangeResponse { kvs: res, more })
}
async fn put(&self, req: PutRequest) -> Result<PutResponse, Self::Error> {
@@ -275,7 +296,7 @@ impl KvBackend for RaftEngineBackend {
key,
range_end,
limit: 0,
keys_only: true,
keys_only: false,
};
let range_resp = self.range(range).await?;
@@ -383,7 +404,12 @@ fn engine_delete(engine: &Engine, key: &[u8]) -> meta_error::Result<()> {
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use std::sync::Arc;
use common_meta::kv_backend::test::{
prepare_kv, test_kv_batch_delete, test_kv_batch_get, test_kv_compare_and_put,
test_kv_delete_range, test_kv_put, test_kv_range, test_kv_range_2,
};
use common_test_util::temp_dir::create_temp_dir;
use raft_engine::{Config, ReadableSize, RecoveryMode};
@@ -615,4 +641,66 @@ mod tests {
keys
);
}
#[tokio::test]
async fn test_range() {
let dir = create_temp_dir("range");
let backend = build_kv_backend(dir.path().to_str().unwrap().to_string());
prepare_kv(&backend).await;
test_kv_range(backend).await;
}
#[tokio::test]
async fn test_range_2() {
let dir = create_temp_dir("range2");
let backend = build_kv_backend(dir.path().to_str().unwrap().to_string());
test_kv_range_2(backend).await;
}
#[tokio::test]
async fn test_put() {
let dir = create_temp_dir("put");
let backend = build_kv_backend(dir.path().to_str().unwrap().to_string());
prepare_kv(&backend).await;
test_kv_put(backend).await;
}
#[tokio::test]
async fn test_batch_get() {
let dir = create_temp_dir("batch_get");
let backend = build_kv_backend(dir.path().to_str().unwrap().to_string());
prepare_kv(&backend).await;
test_kv_batch_get(backend).await;
}
#[tokio::test]
async fn test_batch_delete() {
let dir = create_temp_dir("batch_delete");
let backend = build_kv_backend(dir.path().to_str().unwrap().to_string());
prepare_kv(&backend).await;
test_kv_batch_delete(backend).await;
}
#[tokio::test]
async fn test_delete_range() {
let dir = create_temp_dir("delete_range");
let backend = build_kv_backend(dir.path().to_str().unwrap().to_string());
prepare_kv(&backend).await;
test_kv_delete_range(backend).await;
}
#[tokio::test(flavor = "multi_thread")]
async fn test_compare_and_put_2() {
let dir = create_temp_dir("compare_and_put");
let backend = build_kv_backend(dir.path().to_str().unwrap().to_string());
prepare_kv(&backend).await;
test_kv_compare_and_put(Arc::new(backend)).await;
}
}

View File

@@ -111,8 +111,7 @@ impl MetaSrvInstance {
.await
.context(error::SendShutdownSignalSnafu)?;
}
self.meta_srv.shutdown();
self.meta_srv.shutdown().await?;
self.http_srv
.shutdown()
.await

View File

@@ -41,6 +41,12 @@ pub enum Error {
source: common_meta::error::Error,
},
#[snafu(display("Failed to start telemetry task"))]
StartTelemetryTask {
location: Location,
source: common_runtime::error::Error,
},
#[snafu(display("Failed to submit ddl task"))]
SubmitDdlTask {
location: Location,
@@ -393,8 +399,14 @@ pub enum Error {
#[snafu(display("Missing required parameter, param: {:?}", param))]
MissingRequiredParameter { param: String },
#[snafu(display("Failed to recover procedure"))]
RecoverProcedure {
#[snafu(display("Failed to start procedure manager"))]
StartProcedureManager {
location: Location,
source: common_procedure::Error,
},
#[snafu(display("Failed to stop procedure manager"))]
StopProcedureManager {
location: Location,
source: common_procedure::Error,
},
@@ -616,16 +628,19 @@ impl ErrorExt for Error {
Error::RequestDatanode { source, .. } => source.status_code(),
Error::InvalidCatalogValue { source, .. }
| Error::InvalidFullTableName { source, .. } => source.status_code(),
Error::RecoverProcedure { source, .. }
| Error::SubmitProcedure { source, .. }
| Error::WaitProcedure { source, .. } => source.status_code(),
Error::SubmitProcedure { source, .. } | Error::WaitProcedure { source, .. } => {
source.status_code()
}
Error::ShutdownServer { source, .. } | Error::StartHttp { source, .. } => {
source.status_code()
}
Error::StartProcedureManager { source, .. }
| Error::StopProcedureManager { source, .. } => source.status_code(),
Error::ListCatalogs { source, .. } | Error::ListSchemas { source, .. } => {
source.status_code()
}
Error::StartTelemetryTask { source, .. } => source.status_code(),
Error::RegionFailoverCandidatesNotFound { .. } => StatusCode::RuntimeResourcesExhausted,
Error::NextSequence { source, .. } => source.status_code(),

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use async_trait::async_trait;
@@ -63,7 +64,8 @@ pub async fn get_greptimedb_telemetry_task(
if !enable || cfg!(test) || cfg!(debug_assertions) {
return Arc::new(GreptimeDBTelemetryTask::disable());
}
// Controlled by the meta server state; only the leader reports the info.
let should_report = Arc::new(AtomicBool::new(false));
Arc::new(GreptimeDBTelemetryTask::enable(
TELEMETRY_INTERVAL,
Box::new(GreptimeDBTelemetry::new(
@@ -73,6 +75,8 @@ pub async fn get_greptimedb_telemetry_task(
uuid: default_get_uuid(&working_home),
retry: 0,
}),
should_report.clone(),
)),
should_report,
))
}

View File

@@ -28,7 +28,7 @@ use common_meta::sequence::SequenceRef;
use common_procedure::options::ProcedureConfig;
use common_procedure::ProcedureManagerRef;
use common_telemetry::logging::LoggingOptions;
use common_telemetry::{debug, error, info, warn};
use common_telemetry::{error, info, warn};
use serde::{Deserialize, Serialize};
use servers::http::HttpOptions;
use snafu::ResultExt;
@@ -37,7 +37,10 @@ use tokio::sync::broadcast::error::RecvError;
use crate::cluster::MetaPeerClientRef;
use crate::election::{Election, LeaderChangeMessage};
use crate::error::{InitMetadataSnafu, RecoverProcedureSnafu, Result};
use crate::error::{
InitMetadataSnafu, Result, StartProcedureManagerSnafu, StartTelemetryTaskSnafu,
StopProcedureManagerSnafu,
};
use crate::handler::HeartbeatHandlerGroup;
use crate::lock::DistLockRef;
use crate::pubsub::{PublishRef, SubscribeManagerRef};
@@ -169,6 +172,37 @@ pub struct SelectorContext {
pub type SelectorRef = Arc<dyn Selector<Context = SelectorContext, Output = Vec<Peer>>>;
pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
pub struct MetaStateHandler {
procedure_manager: ProcedureManagerRef,
subscribe_manager: Option<SubscribeManagerRef>,
greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
}
impl MetaStateHandler {
pub async fn on_become_leader(&self) {
if let Err(e) = self.procedure_manager.start().await {
error!(e; "Failed to start procedure manager");
}
self.greptimedb_telemetry_task.should_report(true);
}
pub async fn on_become_follower(&self) {
// Stops the procedures.
if let Err(e) = self.procedure_manager.stop().await {
error!(e; "Failed to stop procedure manager");
}
// Suspends reporting.
self.greptimedb_telemetry_task.should_report(false);
if let Some(sub_manager) = self.subscribe_manager.clone() {
info!("Leader changed, un_subscribe all");
if let Err(e) = sub_manager.un_subscribe_all() {
error!("Failed to un_subscribe all, error: {}", e);
}
}
}
}
#[derive(Clone)]
pub struct MetaSrv {
started: Arc<AtomicBool>,
@@ -212,7 +246,15 @@ impl MetaSrv {
let leader_cached_kv_store = self.leader_cached_kv_store.clone();
let subscribe_manager = self.subscribe_manager();
let mut rx = election.subscribe_leader_change();
let task_handler = self.greptimedb_telemetry_task.clone();
let greptimedb_telemetry_task = self.greptimedb_telemetry_task.clone();
greptimedb_telemetry_task
.start()
.context(StartTelemetryTaskSnafu)?;
let state_handler = MetaStateHandler {
greptimedb_telemetry_task,
subscribe_manager,
procedure_manager,
};
let _handle = common_runtime::spawn_bg(async move {
loop {
match rx.recv().await {
@@ -225,28 +267,12 @@ impl MetaSrv {
);
match msg {
LeaderChangeMessage::Elected(_) => {
if let Err(e) = procedure_manager.recover().await {
error!("Failed to recover procedures, error: {e}");
}
let _ = task_handler.start().map_err(|e| {
debug!(
"Failed to start greptimedb telemetry task, error: {e}"
);
});
state_handler.on_become_leader().await;
}
LeaderChangeMessage::StepDown(leader) => {
if let Some(sub_manager) = subscribe_manager.clone() {
info!("Leader changed, un_subscribe all");
if let Err(e) = sub_manager.un_subscribe_all() {
error!("Failed to un_subscribe all, error: {}", e);
}
}
error!("Leader :{:?} step down", leader);
let _ = task_handler.stop().await.map_err(|e| {
debug!(
"Failed to stop greptimedb telemetry task, error: {e}"
);
});
state_handler.on_become_follower().await;
}
}
}
@@ -259,6 +285,8 @@ impl MetaSrv {
}
}
}
state_handler.on_become_follower().await;
});
let election = election.clone();
@@ -275,9 +303,9 @@ impl MetaSrv {
});
} else {
self.procedure_manager
.recover()
.start()
.await
.context(RecoverProcedureSnafu)?;
.context(StartProcedureManagerSnafu)?;
}
info!("MetaSrv started");
@@ -291,8 +319,12 @@ impl MetaSrv {
.context(InitMetadataSnafu)
}
pub fn shutdown(&self) {
pub async fn shutdown(&self) -> Result<()> {
self.started.store(false, Ordering::Relaxed);
self.procedure_manager
.stop()
.await
.context(StopProcedureManagerSnafu)
}
#[inline]

View File

@@ -15,7 +15,7 @@
use std::time::Duration;
use store_api::region_engine::RegionEngine;
use store_api::region_request::RegionRequest;
use store_api::region_request::{RegionCloseRequest, RegionRequest};
use store_api::storage::RegionId;
use crate::config::MitoConfig;
@@ -55,6 +55,37 @@ async fn test_engine_create_existing_region() {
.unwrap();
}
#[tokio::test]
async fn test_engine_create_close_create_region() {
// This test will trigger the create_or_open function.
let mut env = TestEnv::with_prefix("create-close-create");
let engine = env.create_engine(MitoConfig::default()).await;
let region_id = RegionId::new(1, 1);
let builder = CreateRequestBuilder::new();
// Create a region with id 1.
engine
.handle_request(region_id, RegionRequest::Create(builder.build()))
.await
.unwrap();
// Close the region.
engine
.handle_request(region_id, RegionRequest::Close(RegionCloseRequest {}))
.await
.unwrap();
// Create the same region id again.
engine
.handle_request(region_id, RegionRequest::Create(builder.build()))
.await
.unwrap();
assert!(engine.is_region_exists(region_id));
let region = engine.get_region(region_id).unwrap();
assert!(region.is_writable());
}
#[tokio::test]
async fn test_engine_create_with_different_id() {
let mut env = TestEnv::new();

View File

@@ -17,7 +17,7 @@ use common_query::logical_plan::DfExpr;
use common_query::prelude::Expr;
use common_recordbatch::RecordBatches;
use datafusion_common::ScalarValue;
use datafusion_expr::lit;
use datafusion_expr::{col, lit};
use store_api::region_engine::RegionEngine;
use store_api::region_request::RegionRequest;
use store_api::storage::{RegionId, ScanRequest};
@@ -46,7 +46,7 @@ async fn check_prune_row_groups(expr: DfExpr, expected: &str) {
region_id,
Rows {
schema: column_schemas.clone(),
rows: build_rows(0, 10),
rows: build_rows(0, 15),
},
)
.await;
@@ -76,6 +76,16 @@ async fn test_read_parquet_stats() {
+-------+---------+---------------------+
| tag_0 | field_0 | ts |
+-------+---------+---------------------+
| 0 | 0.0 | 1970-01-01T00:00:00 |
| 1 | 1.0 | 1970-01-01T00:00:01 |
| 10 | 10.0 | 1970-01-01T00:00:10 |
| 11 | 11.0 | 1970-01-01T00:00:11 |
| 12 | 12.0 | 1970-01-01T00:00:12 |
| 13 | 13.0 | 1970-01-01T00:00:13 |
| 14 | 14.0 | 1970-01-01T00:00:14 |
| 2 | 2.0 | 1970-01-01T00:00:02 |
| 3 | 3.0 | 1970-01-01T00:00:03 |
| 4 | 4.0 | 1970-01-01T00:00:04 |
| 5 | 5.0 | 1970-01-01T00:00:05 |
| 6 | 6.0 | 1970-01-01T00:00:06 |
| 7 | 7.0 | 1970-01-01T00:00:07 |
@@ -84,7 +94,11 @@ async fn test_read_parquet_stats() {
+-------+---------+---------------------+",
)
.await;
}
#[tokio::test]
async fn test_prune_tag() {
// prune result: only row groups 1 and 2
check_prune_row_groups(
datafusion_expr::col("tag_0").gt(lit(ScalarValue::Utf8(Some("4".to_string())))),
"\
@@ -100,3 +114,25 @@ async fn test_read_parquet_stats() {
)
.await;
}
#[tokio::test]
async fn test_prune_tag_and_field() {
common_telemetry::init_default_ut_logging();
// prune result: only row group 1
check_prune_row_groups(
col("tag_0")
.gt(lit(ScalarValue::Utf8(Some("4".to_string()))))
.and(col("field_0").lt(lit(8.0))),
"\
+-------+---------+---------------------+
| tag_0 | field_0 | ts |
+-------+---------+---------------------+
| 5 | 5.0 | 1970-01-01T00:00:05 |
| 6 | 6.0 | 1970-01-01T00:00:06 |
| 7 | 7.0 | 1970-01-01T00:00:07 |
| 8 | 8.0 | 1970-01-01T00:00:08 |
| 9 | 9.0 | 1970-01-01T00:00:09 |
+-------+---------+---------------------+",
)
.await;
}
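For intuition, this pruning relies on per-row-group min/max statistics. A standalone sketch (hypothetical stats struct, not the mito2 API) of how a combined tag/field predicate such as tag_0 > "4" AND field_0 < 8.0 can rule a row group out:

struct RowGroupStats<'a> {
    tag_max: &'a str,
    field_min: f64,
}

/// True if the row group may contain rows matching
/// `tag_0 > tag_bound AND field_0 < field_bound` (tags compare as strings).
fn may_match(stats: &RowGroupStats<'_>, tag_bound: &str, field_bound: f64) -> bool {
    stats.tag_max > tag_bound && stats.field_min < field_bound
}

fn main() {
    // A group whose tags span "0".."14" (string order) can be skipped,
    // while a group spanning "5".."9" must still be read.
    let skipped = RowGroupStats { tag_max: "14", field_min: 0.0 };
    let kept = RowGroupStats { tag_max: "9", field_min: 5.0 };
    assert!(!may_match(&skipped, "4", 8.0));
    assert!(may_match(&kept, "4", 8.0));
}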

View File

@@ -154,6 +154,12 @@ impl RegionManifestManager {
let inner = self.inner.read().await;
inner.store.clone()
}
/// Returns total manifest size.
pub async fn manifest_size(&self) -> u64 {
let inner = self.inner.read().await;
inner.total_manifest_size()
}
}
#[cfg(test)]
@@ -186,7 +192,7 @@ impl RegionManifestManagerInner {
/// Creates a new manifest.
async fn new(metadata: RegionMetadataRef, options: RegionManifestOptions) -> Result<Self> {
// construct storage
let store = ManifestObjectStore::new(
let mut store = ManifestObjectStore::new(
&options.manifest_dir,
options.object_store.clone(),
options.compress_type,
@@ -232,7 +238,7 @@ impl RegionManifestManagerInner {
/// Returns `Ok(None)` if no such manifest.
async fn open(options: RegionManifestOptions) -> Result<Option<Self>> {
// construct storage
let store = ManifestObjectStore::new(
let mut store = ManifestObjectStore::new(
&options.manifest_dir,
options.object_store.clone(),
options.compress_type,
@@ -240,8 +246,9 @@ impl RegionManifestManagerInner {
// recover from storage
// construct manifest builder
// calculate the manifest size from the latest checkpoint
let mut version = MIN_VERSION;
let checkpoint = Self::last_checkpoint(&store).await?;
let checkpoint = Self::last_checkpoint(&mut store).await?;
let last_checkpoint_version = checkpoint
.as_ref()
.map(|checkpoint| checkpoint.last_version)
@@ -265,6 +272,8 @@ impl RegionManifestManagerInner {
let mut action_iter = store.scan(version, MAX_VERSION).await?;
while let Some((manifest_version, raw_action_list)) = action_iter.next_log().await? {
let action_list = RegionMetaActionList::decode(&raw_action_list)?;
// record the size of each delta file written after the last checkpoint
store.set_delta_file_size(manifest_version, raw_action_list.len() as u64);
for action in action_list.actions {
match action {
RegionMetaAction::Change(action) => {
@@ -312,6 +321,7 @@ impl RegionManifestManagerInner {
Ok(())
}
/// Update the manifest. Return the current manifest version number.
async fn update(&mut self, action_list: RegionMetaActionList) -> Result<ManifestVersion> {
let version = self.increase_version();
self.store.save(version, &action_list.encode()?).await?;
@@ -343,6 +353,11 @@ impl RegionManifestManagerInner {
Ok(version)
}
/// Returns total manifest size.
pub(crate) fn total_manifest_size(&self) -> u64 {
self.store.total_manifest_size()
}
}
impl RegionManifestManagerInner {
@@ -369,8 +384,8 @@ impl RegionManifestManagerInner {
}
/// Make a new checkpoint. Return the fresh one if there are some actions to compact.
async fn do_checkpoint(&self) -> Result<Option<RegionCheckpoint>> {
let last_checkpoint = Self::last_checkpoint(&self.store).await?;
async fn do_checkpoint(&mut self) -> Result<Option<RegionCheckpoint>> {
let last_checkpoint = Self::last_checkpoint(&mut self.store).await?;
let current_version = self.last_version;
let (start_version, mut manifest_builder) = if let Some(checkpoint) = last_checkpoint {
@@ -441,7 +456,7 @@ impl RegionManifestManagerInner {
/// Fetch the last [RegionCheckpoint] from storage.
pub(crate) async fn last_checkpoint(
store: &ManifestObjectStore,
store: &mut ManifestObjectStore,
) -> Result<Option<RegionCheckpoint>> {
let last_checkpoint = store.load_last_checkpoint().await?;
@@ -456,14 +471,16 @@ impl RegionManifestManagerInner {
#[cfg(test)]
mod test {
use api::v1::SemanticType;
use common_datasource::compression::CompressionType;
use common_test_util::temp_dir::create_temp_dir;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
use super::*;
use crate::manifest::action::RegionChange;
use crate::manifest::action::{RegionChange, RegionEdit};
use crate::manifest::tests::utils::basic_region_metadata;
use crate::test_util::TestEnv;
@@ -546,4 +563,95 @@ mod test {
.unwrap();
manager.validate_manifest(&new_metadata, 1).await;
}
/// Test helper; see wal_dir_usage in src/store-api/src/logstore.rs.
async fn manifest_dir_usage(path: &str) -> u64 {
let mut size = 0;
let mut read_dir = tokio::fs::read_dir(path).await.unwrap();
while let Ok(dir_entry) = read_dir.next_entry().await {
let Some(entry) = dir_entry else {
break;
};
if entry.file_type().await.unwrap().is_file() {
let file_name = entry.file_name().into_string().unwrap();
if file_name.contains(".checkpoint") || file_name.contains(".json") {
let file_size = entry.metadata().await.unwrap().len() as usize;
debug!("File: {file_name:?}, size: {file_size}");
size += file_size;
}
}
}
size as u64
}
#[tokio::test]
async fn test_manifest_size() {
let metadata = Arc::new(basic_region_metadata());
let data_home = create_temp_dir("");
let data_home_path = data_home.path().to_str().unwrap().to_string();
let env = TestEnv::with_data_home(data_home);
let manifest_dir = format!("{}/manifest", data_home_path);
let manager = env
.create_manifest_manager(CompressionType::Uncompressed, 10, Some(metadata.clone()))
.await
.unwrap()
.unwrap();
let mut new_metadata_builder = RegionMetadataBuilder::from_existing((*metadata).clone());
new_metadata_builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new("val2", ConcreteDataType::float64_datatype(), false),
semantic_type: SemanticType::Field,
column_id: 252,
});
let new_metadata = Arc::new(new_metadata_builder.build().unwrap());
let action_list =
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
metadata: new_metadata.clone(),
}));
let current_version = manager.update(action_list).await.unwrap();
assert_eq!(current_version, 1);
manager.validate_manifest(&new_metadata, 1).await;
// get manifest size
let manifest_size = manager.manifest_size().await;
assert_eq!(manifest_size, manifest_dir_usage(&manifest_dir).await);
// apply 10 no-op edit actions to trigger a checkpoint
for _ in 0..10 {
manager
.update(RegionMetaActionList::new(vec![RegionMetaAction::Edit(
RegionEdit {
files_to_add: vec![],
files_to_remove: vec![],
compaction_time_window: None,
flushed_entry_id: None,
flushed_sequence: None,
},
)]))
.await
.unwrap();
}
// check manifest size again
let manifest_size = manager.manifest_size().await;
assert_eq!(manifest_size, manifest_dir_usage(&manifest_dir).await);
// Reopen the manager; the total size is recalculated from the latest checkpoint file onward.
manager.stop().await.unwrap();
let manager = env
.create_manifest_manager(CompressionType::Uncompressed, 10, None)
.await
.unwrap()
.unwrap();
manager.validate_manifest(&new_metadata, 11).await;
// get manifest size again
let manifest_size = manager.manifest_size().await;
assert_eq!(manifest_size, 1312);
}
}

View File

@@ -129,11 +129,22 @@ impl ObjectStoreLogIterator {
}
}
/// Key to identify a manifest file.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum FileKey {
/// A delta file (`.json`).
Delta(ManifestVersion),
/// A checkpoint file (`.checkpoint`).
Checkpoint(ManifestVersion),
}
#[derive(Clone, Debug)]
pub struct ManifestObjectStore {
object_store: ObjectStore,
compress_type: CompressionType,
path: String,
/// Stores the size of each manifest file.
manifest_size_map: HashMap<FileKey, u64>,
}
impl ManifestObjectStore {
@@ -142,6 +153,7 @@ impl ManifestObjectStore {
object_store,
compress_type,
path: util::normalize_dir(path),
manifest_size_map: HashMap::new(),
}
}
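The manifest_size_map added above is simple bookkeeping: record a file's size when it is written or loaded, drop the entry when the file is deleted, and sum the map for the total. A minimal standalone sketch of that idea (simplified, hypothetical types):

use std::collections::HashMap;

#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum FileKey {
    Delta(u64),
    Checkpoint(u64),
}

#[derive(Default)]
struct SizeTracker {
    sizes: HashMap<FileKey, u64>,
}

impl SizeTracker {
    fn record(&mut self, key: FileKey, size: u64) {
        // Called on save / load of a delta or checkpoint file.
        self.sizes.insert(key, size);
    }
    fn forget(&mut self, key: FileKey) {
        // Called when delete_until removes the file.
        self.sizes.remove(&key);
    }
    fn total(&self) -> u64 {
        self.sizes.values().sum()
    }
}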
@@ -184,6 +196,7 @@ impl ManifestObjectStore {
.context(OpenDalSnafu)
}
/// Scan the manifest files in the range of [start, end) and return the iterator.
pub async fn scan(
&self,
start: ManifestVersion,
@@ -212,8 +225,12 @@ impl ManifestObjectStore {
})
}
/// Delete manifest files whose version is less than `end`.
/// If `keep_last_checkpoint` is true, the last checkpoint file will be kept.
///
/// Returns the number of deleted files.
pub async fn delete_until(
&self,
&mut self,
end: ManifestVersion,
keep_last_checkpoint: bool,
) -> Result<usize> {
@@ -248,7 +265,7 @@ impl ManifestObjectStore {
} else {
None
};
let paths: Vec<_> = entries
let del_entries: Vec<_> = entries
.iter()
.filter(|(_e, is_checkpoint, version)| {
if let Some(max_version) = checkpoint_version {
@@ -264,12 +281,15 @@ impl ManifestObjectStore {
true
}
})
.map(|e| e.0.path().to_string())
.collect();
let paths = del_entries
.iter()
.map(|(e, _, _)| e.path().to_string())
.collect::<Vec<_>>();
let ret = paths.len();
debug!(
"Deleting {} logs from manifest storage path {} until {}, checkpoint: {:?}, paths: {:?}",
"Deleting {} logs from manifest storage path {} until {}, checkpoint_version: {:?}, paths: {:?}",
ret,
self.path,
end,
@@ -282,10 +302,21 @@ impl ManifestObjectStore {
.await
.context(OpenDalSnafu)?;
// remove the deleted files from the manifest size map
for (_, is_checkpoint, version) in &del_entries {
if *is_checkpoint {
self.manifest_size_map
.remove(&FileKey::Checkpoint(*version));
} else {
self.manifest_size_map.remove(&FileKey::Delta(*version));
}
}
Ok(ret)
}
pub async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
/// Save the delta manifest file.
pub async fn save(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
let path = self.delta_file_path(version);
debug!("Save log to manifest storage, version: {}", version);
let data = self
@@ -296,13 +327,17 @@ impl ManifestObjectStore {
compress_type: self.compress_type,
path: &path,
})?;
let delta_size = data.len();
self.object_store
.write(&path, data)
.await
.context(OpenDalSnafu)
.context(OpenDalSnafu)?;
self.set_delta_file_size(version, delta_size as u64);
Ok(())
}
pub async fn save_checkpoint(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
/// Save the checkpoint manifest file.
pub async fn save_checkpoint(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
let path = self.checkpoint_file_path(version);
let data = self
.compress_type
@@ -312,10 +347,12 @@ impl ManifestObjectStore {
compress_type: self.compress_type,
path: &path,
})?;
let checkpoint_size = data.len();
self.object_store
.write(&path, data)
.await
.context(OpenDalSnafu)?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
// The last checkpoint file only contains the size and version, so it is tiny and we don't compress it.
let last_checkpoint_path = self.last_checkpoint_path();
@@ -342,7 +379,7 @@ impl ManifestObjectStore {
}
pub async fn load_checkpoint(
&self,
&mut self,
version: ManifestVersion,
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let path = self.checkpoint_file_path(version);
@@ -351,12 +388,15 @@ impl ManifestObjectStore {
let checkpoint_data =
match self.object_store.read(&path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data = self.compress_type.decode(checkpoint).await.context(
DecompressObjectSnafu {
compress_type: self.compress_type,
path,
},
)?;
// set the checkpoint size
self.set_checkpoint_file_size(version, checkpoint_size as u64);
Ok(Some(decompress_data))
}
Err(e) => {
@@ -373,6 +413,7 @@ impl ManifestObjectStore {
);
match self.object_store.read(&fall_back_path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data = FALL_BACK_COMPRESS_TYPE
.decode(checkpoint)
.await
@@ -380,6 +421,7 @@ impl ManifestObjectStore {
compress_type: FALL_BACK_COMPRESS_TYPE,
path,
})?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
Ok(Some(decompress_data))
}
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
@@ -398,7 +440,7 @@ impl ManifestObjectStore {
/// Load the latest checkpoint.
/// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any
pub async fn load_last_checkpoint(&self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let last_checkpoint_path = self.last_checkpoint_path();
let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
Ok(data) => data,
@@ -424,6 +466,22 @@ impl ManifestObjectStore {
pub async fn read_file(&self, path: &str) -> Result<Vec<u8>> {
self.object_store.read(path).await.context(OpenDalSnafu)
}
/// Compute the total size (in bytes) of the files tracked in the manifest size map.
pub(crate) fn total_manifest_size(&self) -> u64 {
self.manifest_size_map.values().sum()
}
/// Set the size of the delta file by delta version.
pub(crate) fn set_delta_file_size(&mut self, version: ManifestVersion, size: u64) {
self.manifest_size_map.insert(FileKey::Delta(version), size);
}
/// Set the size of the checkpoint file by checkpoint version.
pub(crate) fn set_checkpoint_file_size(&mut self, version: ManifestVersion, size: u64) {
self.manifest_size_map
.insert(FileKey::Checkpoint(version), size);
}
}
#[derive(Serialize, Deserialize, Debug)]
@@ -489,7 +547,7 @@ mod tests {
test_manifest_log_store_case(log_store).await;
}
async fn test_manifest_log_store_case(log_store: ManifestObjectStore) {
async fn test_manifest_log_store_case(mut log_store: ManifestObjectStore) {
for v in 0..5 {
log_store
.save(v, format!("hello, {v}").as_bytes())
@@ -600,4 +658,92 @@ mod tests {
let mut it = log_store.scan(0, 10).await.unwrap();
assert!(it.next_log().await.unwrap().is_none());
}
#[tokio::test]
async fn test_file_version() {
let version = file_version("00000000000000000007.checkpoint");
assert_eq!(version, 7);
let name = delta_file(version);
assert_eq!(name, "00000000000000000007.json");
let name = checkpoint_file(version);
assert_eq!(name, "00000000000000000007.checkpoint");
}
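The test above implies the manifest file naming scheme: a 20-digit, zero-padded version plus a `.json` (delta) or `.checkpoint` suffix. A sketch consistent with those assertions (the crate's own delta_file/checkpoint_file/file_version helpers are assumed to behave like this, but may differ in detail):

fn delta_file_name(version: u64) -> String {
    format!("{version:020}.json")
}

fn checkpoint_file_name(version: u64) -> String {
    format!("{version:020}.checkpoint")
}

fn parse_file_version(name: &str) -> u64 {
    // Take the digits before the first '.' and parse them as the version.
    name.split('.').next().unwrap().parse().unwrap()
}

fn main() {
    assert_eq!(delta_file_name(7), "00000000000000000007.json");
    assert_eq!(checkpoint_file_name(7), "00000000000000000007.checkpoint");
    assert_eq!(parse_file_version("00000000000000000007.checkpoint"), 7);
}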
#[tokio::test]
async fn test_uncompressed_manifest_files_size() {
let mut log_store = new_test_manifest_store();
// write 5 manifest files, 8 bytes each when uncompressed
log_store.compress_type = CompressionType::Uncompressed;
for v in 0..5 {
log_store
.save(v, format!("hello, {v}").as_bytes())
.await
.unwrap();
}
// write 1 checkpoint file, 23 bytes when uncompressed
log_store
.save_checkpoint(5, "checkpoint_uncompressed".as_bytes())
.await
.unwrap();
// manifest files size
assert_eq!(log_store.total_manifest_size(), 63);
// delete 3 manifest files
assert_eq!(log_store.delete_until(3, false).await.unwrap(), 3);
// manifest files size after delete
assert_eq!(log_store.total_manifest_size(), 39);
// delete all manifest files
assert_eq!(
log_store
.delete_until(ManifestVersion::MAX, false)
.await
.unwrap(),
3
);
assert_eq!(log_store.total_manifest_size(), 0);
}
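The 63/39 assertions above follow from simple byte counts; a quick check of the arithmetic, with payload lengths taken from the test itself:

fn main() {
    let delta = "hello, 0".len() as u64; // 8 bytes per delta file payload
    let checkpoint = "checkpoint_uncompressed".len() as u64; // 23 bytes
    assert_eq!(5 * delta + checkpoint, 63); // total after all writes
    assert_eq!(5 * delta + checkpoint - 3 * delta, 39); // after deleting versions 0..3
}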
#[tokio::test]
async fn test_compressed_manifest_files_size() {
let mut log_store = new_test_manifest_store();
// Test with compressed manifest files
log_store.compress_type = CompressionType::Gzip;
// write 5 manifest files
for v in 0..5 {
log_store
.save(v, format!("hello, {v}").as_bytes())
.await
.unwrap();
}
log_store
.save_checkpoint(5, "checkpoint_compressed".as_bytes())
.await
.unwrap();
// manifest files size
assert_eq!(log_store.total_manifest_size(), 181);
// delete 3 manifest files
assert_eq!(log_store.delete_until(3, false).await.unwrap(), 3);
// manifest files size after delete
assert_eq!(log_store.total_manifest_size(), 97);
// delete all manifest files
assert_eq!(
log_store
.delete_until(ManifestVersion::MAX, false)
.await
.unwrap(),
3
);
assert_eq!(log_store.total_manifest_size(), 0);
}
}

View File

@@ -202,7 +202,7 @@ async fn generate_checkpoint_with_compression_types(
manager.update(action).await.unwrap();
}
RegionManifestManagerInner::last_checkpoint(&manager.store().await)
RegionManifestManagerInner::last_checkpoint(&mut manager.store().await)
.await
.unwrap()
.unwrap()

View File

@@ -20,8 +20,11 @@ use std::sync::{Arc, RwLock};
use api::v1::OpType;
use common_telemetry::debug;
use datafusion::physical_plan::PhysicalExpr;
use datafusion_common::ScalarValue;
use datafusion_expr::ColumnarValue;
use datatypes::arrow;
use datatypes::arrow::array::ArrayRef;
use datatypes::arrow::array::{ArrayRef, BooleanArray};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::data_type::DataType;
use datatypes::prelude::{MutableVector, ScalarVectorBuilder, Vector, VectorRef};
@@ -300,12 +303,16 @@ impl SeriesSet {
let (primary_key_builders, primary_key_schema) =
primary_key_builders(&self.region_metadata, 1);
let physical_exprs: Vec<_> = predicate
.and_then(|p| p.to_physical_exprs(&primary_key_schema).ok())
.unwrap_or_default();
Iter {
metadata: self.region_metadata.clone(),
series: self.series.clone(),
projection,
last_key: None,
predicate,
predicate: physical_exprs,
pk_schema: primary_key_schema,
primary_key_builders,
codec: self.codec.clone(),
@@ -341,7 +348,7 @@ struct Iter {
series: Arc<SeriesRwLockMap>,
projection: HashSet<ColumnId>,
last_key: Option<Vec<u8>>,
predicate: Option<Predicate>,
predicate: Vec<Arc<dyn PhysicalExpr>>,
pk_schema: arrow::datatypes::SchemaRef,
primary_key_builders: Vec<Box<dyn MutableVector>>,
codec: Arc<McmpRowCodec>,
@@ -362,18 +369,18 @@ impl Iterator for Iter {
// TODO(hl): maybe yield more than one time series to amortize range overhead.
for (primary_key, series) in range {
let mut series = series.write().unwrap();
if let Some(predicate) = &self.predicate {
if !prune_primary_key(
if !self.predicate.is_empty()
&& !prune_primary_key(
&self.codec,
primary_key.as_slice(),
&mut series,
&mut self.primary_key_builders,
self.pk_schema.clone(),
predicate,
) {
// read next series
continue;
}
&self.predicate,
)
{
// read next series
continue;
}
self.last_key = Some(primary_key.clone());
@@ -392,7 +399,7 @@ fn prune_primary_key(
series: &mut Series,
builders: &mut Vec<Box<dyn MutableVector>>,
pk_schema: arrow::datatypes::SchemaRef,
predicate: &Predicate,
predicate: &[Arc<dyn PhysicalExpr>],
) -> bool {
// no primary key, we simply return true.
if pk_schema.fields().is_empty() {
@@ -400,20 +407,52 @@ fn prune_primary_key(
}
if let Some(rb) = series.pk_cache.as_ref() {
let res = predicate.prune_primary_key(rb).unwrap_or(true);
let res = prune_inner(predicate, rb).unwrap_or(true);
debug!("Prune primary key: {:?}, res: {:?}", rb, res);
res
} else {
let Ok(rb) = pk_to_record_batch(codec, pk, builders, pk_schema) else {
return true;
};
let res = predicate.prune_primary_key(&rb).unwrap_or(true);
let res = prune_inner(predicate, &rb).unwrap_or(true);
debug!("Prune primary key: {:?}, res: {:?}", rb, res);
series.update_pk_cache(rb);
res
}
}
fn prune_inner(predicates: &[Arc<dyn PhysicalExpr>], primary_key: &RecordBatch) -> Result<bool> {
for expr in predicates {
// evaluate every filter against primary key
let Ok(eva) = expr.evaluate(primary_key) else {
continue;
};
let result = match eva {
ColumnarValue::Array(array) => {
let predicate_array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
predicate_array
.into_iter()
.map(|x| x.unwrap_or(true))
.next()
.unwrap_or(true)
}
// result is a scalar value
ColumnarValue::Scalar(ScalarValue::Boolean(v)) => v.unwrap_or(true),
_ => {
unreachable!("Unexpected primary key record batch evaluation result: {:?}, primary key: {:?}", eva, primary_key);
}
};
debug!(
"Evaluate primary key {:?} against filter: {:?}, result: {:?}",
primary_key, expr, result
);
if !result {
return Ok(false);
}
}
Ok(true)
}
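prune_inner only drops a series when some predicate definitively evaluates to false; evaluation errors and unknown (null) results keep it. A tiny standalone sketch of that keep-unless-false semantics, with plain function pointers standing in for physical expressions:

fn prune_with(predicates: &[fn() -> Option<bool>]) -> bool {
    for p in predicates {
        // `None` stands in for an evaluation error or a NULL result.
        if let Some(false) = p() {
            return false; // definitively filtered out
        }
    }
    true // keep the series
}

fn main() {
    let keep: [fn() -> Option<bool>; 2] = [|| Some(true), || None];
    let drop: [fn() -> Option<bool>; 2] = [|| Some(true), || Some(false)];
    assert!(prune_with(&keep));
    assert!(!prune_with(&drop));
}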
fn pk_to_record_batch(
codec: &Arc<McmpRowCodec>,
bytes: &[u8],

View File

@@ -17,13 +17,12 @@
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::debug;
use common_time::range::TimestampRange;
use snafu::ResultExt;
use store_api::storage::ScanRequest;
use table::predicate::{Predicate, TimeRangePredicateBuilder};
use crate::access_layer::AccessLayerRef;
use crate::cache::CacheManagerRef;
use crate::error::{BuildPredicateSnafu, Result};
use crate::error::Result;
use crate::read::projection::ProjectionMapper;
use crate::read::seq_scan::SeqScan;
use crate::region::version::VersionRef;
@@ -173,11 +172,7 @@ impl ScanRegion {
total_ssts
);
let predicate = Predicate::try_new(
self.request.filters.clone(),
self.version.metadata.schema.clone(),
)
.context(BuildPredicateSnafu)?;
let predicate = Predicate::new(self.request.filters.clone());
let mapper = match &self.request.projection {
Some(p) => ProjectionMapper::new(&self.version.metadata, p.iter().copied())?,
None => ProjectionMapper::all(&self.version.metadata)?,

View File

@@ -119,7 +119,8 @@ impl RegionOpener {
&expect.column_metadatas,
&expect.primary_key,
)?;
// To keep consistent with the Create behavior, set the opened region writable.
region.set_writable(true);
return Ok(region);
}
Ok(None) => {

View File

@@ -16,6 +16,7 @@
mod format;
pub mod reader;
pub mod row_group;
mod stats;
pub mod writer;

View File

@@ -188,8 +188,9 @@ impl ParquetReaderBuilder {
&read_format,
column_ids,
);
let pruned_row_groups = predicate
.prune_with_stats(&stats)
.prune_with_stats(&stats, read_format.metadata().schema.arrow_schema())
.into_iter()
.enumerate()
.filter_map(|(idx, valid)| if valid { Some(idx) } else { None })

View File

@@ -0,0 +1,230 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Ports private structs from [parquet crate](https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/arrow/async_reader/mod.rs#L644-L650).
use std::sync::Arc;
use bytes::{Buf, Bytes};
use parquet::arrow::arrow_reader::{RowGroups, RowSelection};
use parquet::arrow::async_reader::AsyncFileReader;
use parquet::arrow::ProjectionMask;
use parquet::column::page::{PageIterator, PageReader};
use parquet::errors::{ParquetError, Result};
use parquet::file::metadata::RowGroupMetaData;
use parquet::file::reader::{ChunkReader, Length};
use parquet::file::serialized_reader::SerializedPageReader;
use parquet::format::PageLocation;
/// An in-memory collection of column chunks
pub struct InMemoryRowGroup<'a> {
metadata: &'a RowGroupMetaData,
page_locations: Option<&'a [Vec<PageLocation>]>,
column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
row_count: usize,
}
impl<'a> InMemoryRowGroup<'a> {
/// Fetches the necessary column data into memory
// TODO(yingwen): Fix clippy warnings.
#[allow(clippy::filter_map_bool_then)]
#[allow(clippy::useless_conversion)]
pub async fn fetch<T: AsyncFileReader + Send>(
&mut self,
input: &mut T,
projection: &ProjectionMask,
selection: Option<&RowSelection>,
) -> Result<()> {
if let Some((selection, page_locations)) = selection.zip(self.page_locations) {
// If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the
// `RowSelection`
let mut page_start_offsets: Vec<Vec<usize>> = vec![];
let fetch_ranges = self
.column_chunks
.iter()
.zip(self.metadata.columns())
.enumerate()
.filter_map(|(idx, (chunk, chunk_meta))| {
(chunk.is_none() && projection.leaf_included(idx)).then(|| {
// If the first page does not start at the beginning of the column,
// then we need to also fetch a dictionary page.
let mut ranges = vec![];
let (start, _len) = chunk_meta.byte_range();
match page_locations[idx].first() {
Some(first) if first.offset as u64 != start => {
ranges.push(start as usize..first.offset as usize);
}
_ => (),
}
ranges.extend(selection.scan_ranges(&page_locations[idx]));
page_start_offsets.push(ranges.iter().map(|range| range.start).collect());
ranges
})
})
.flatten()
.collect();
let mut chunk_data = input.get_byte_ranges(fetch_ranges).await?.into_iter();
let mut page_start_offsets = page_start_offsets.into_iter();
for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
if chunk.is_some() || !projection.leaf_included(idx) {
continue;
}
if let Some(offsets) = page_start_offsets.next() {
let mut chunks = Vec::with_capacity(offsets.len());
for _ in 0..offsets.len() {
chunks.push(chunk_data.next().unwrap());
}
*chunk = Some(Arc::new(ColumnChunkData::Sparse {
length: self.metadata.column(idx).byte_range().1 as usize,
data: offsets.into_iter().zip(chunks.into_iter()).collect(),
}))
}
}
} else {
let fetch_ranges = self
.column_chunks
.iter()
.enumerate()
.filter_map(|(idx, chunk)| {
(chunk.is_none() && projection.leaf_included(idx)).then(|| {
let column = self.metadata.column(idx);
let (start, length) = column.byte_range();
start as usize..(start + length) as usize
})
})
.collect();
let mut chunk_data = input.get_byte_ranges(fetch_ranges).await?.into_iter();
for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
if chunk.is_some() || !projection.leaf_included(idx) {
continue;
}
if let Some(data) = chunk_data.next() {
*chunk = Some(Arc::new(ColumnChunkData::Dense {
offset: self.metadata.column(idx).byte_range().0 as usize,
data,
}));
}
}
}
Ok(())
}
}
impl<'a> RowGroups for InMemoryRowGroup<'a> {
fn num_rows(&self) -> usize {
self.row_count
}
fn column_chunks(&self, i: usize) -> Result<Box<dyn PageIterator>> {
match &self.column_chunks[i] {
None => Err(ParquetError::General(format!(
"Invalid column index {i}, column was not fetched"
))),
Some(data) => {
let page_locations = self.page_locations.map(|index| index[i].clone());
let page_reader: Box<dyn PageReader> = Box::new(SerializedPageReader::new(
data.clone(),
self.metadata.column(i),
self.row_count,
page_locations,
)?);
Ok(Box::new(ColumnChunkIterator {
reader: Some(Ok(page_reader)),
}))
}
}
}
}
/// An in-memory column chunk
#[derive(Clone)]
enum ColumnChunkData {
/// Column chunk data representing only a subset of data pages
Sparse {
/// Length of the full column chunk
length: usize,
/// Set of data pages included in this sparse chunk. Each element is a tuple
/// of (page offset, page data)
data: Vec<(usize, Bytes)>,
},
/// Full column chunk and its offset
Dense { offset: usize, data: Bytes },
}
impl ColumnChunkData {
fn get(&self, start: u64) -> Result<Bytes> {
match &self {
ColumnChunkData::Sparse { data, .. } => data
.binary_search_by_key(&start, |(offset, _)| *offset as u64)
.map(|idx| data[idx].1.clone())
.map_err(|_| {
ParquetError::General(format!(
"Invalid offset in sparse column chunk data: {start}"
))
}),
ColumnChunkData::Dense { offset, data } => {
let start = start as usize - *offset;
Ok(data.slice(start..))
}
}
}
}
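For illustration, the sparse variant stores (absolute byte offset, page bytes) pairs and resolves a read by looking up the exact page start with a binary search; a simplified standalone sketch (not the parquet types):

fn find_page<'a>(pages: &'a [(usize, &'a [u8])], start: usize) -> Option<&'a [u8]> {
    pages
        .binary_search_by_key(&start, |(offset, _)| *offset)
        .ok()
        .map(|idx| pages[idx].1)
}

fn main() {
    let pages: &[(usize, &[u8])] = &[(100, b"dictionary page"), (250, b"data page")];
    assert_eq!(find_page(pages, 250), Some(&b"data page"[..]));
    assert_eq!(find_page(pages, 300), None); // 300 is not a page start: lookup fails
}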
impl Length for ColumnChunkData {
fn len(&self) -> u64 {
match &self {
ColumnChunkData::Sparse { length, .. } => *length as u64,
ColumnChunkData::Dense { data, .. } => data.len() as u64,
}
}
}
impl ChunkReader for ColumnChunkData {
type T = bytes::buf::Reader<Bytes>;
fn get_read(&self, start: u64) -> Result<Self::T> {
Ok(self.get(start)?.reader())
}
fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
Ok(self.get(start)?.slice(..length))
}
}
/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`]
struct ColumnChunkIterator {
reader: Option<Result<Box<dyn PageReader>>>,
}
impl Iterator for ColumnChunkIterator {
type Item = Result<Box<dyn PageReader>>;
fn next(&mut self) -> Option<Self::Item> {
self.reader.take()
}
}
impl PageIterator for ColumnChunkIterator {}

View File

@@ -99,6 +99,15 @@ impl TestEnv {
}
}
/// Returns a new env with a specific `data_home` for tests.
pub fn with_data_home(data_home: TempDir) -> TestEnv {
TestEnv {
data_home,
logstore: None,
object_store: None,
}
}
pub fn get_logstore(&self) -> Option<Arc<RaftEngineLogStore>> {
self.logstore.clone()
}

View File

@@ -7,6 +7,8 @@ license.workspace = true
[dependencies]
async-trait = "0.1"
bytes = "1.4"
common-error.workspace = true
common-macro.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
futures.workspace = true
@@ -17,6 +19,7 @@ opendal = { version = "0.40", features = [
"layers-tracing",
"layers-metrics",
] }
snafu.workspace = true
uuid.workspace = true
[dev-dependencies]

View File

@@ -0,0 +1,45 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("Default storage not found: {}", default_object_store))]
DefaultStorageNotFound {
location: Location,
default_object_store: String,
},
}
pub type Result<T> = std::result::Result<T, Error>;
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Error::DefaultStorageNotFound { .. } => StatusCode::InvalidArguments,
}
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -19,7 +19,9 @@ pub use opendal::{
Operator as ObjectStore, Reader, Result, Writer,
};
pub mod error;
pub mod layers;
pub mod manager;
mod metrics;
pub mod test_util;
pub mod util;

View File

@@ -0,0 +1,107 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use snafu::OptionExt;
use crate::error::{DefaultStorageNotFoundSnafu, Result};
use crate::ObjectStore;
/// Manages multiple object stores so that users can configure a storage backend for each table.
/// This struct always has exactly one default object store and zero or more custom object stores.
pub struct ObjectStoreManager {
stores: HashMap<String, ObjectStore>,
default_object_store: ObjectStore,
}
impl ObjectStoreManager {
/// Creates a new manager with specific object stores. Returns an error if `stores` doesn't contain the default object store.
pub fn try_new(
stores: HashMap<String, ObjectStore>,
default_object_store: &str,
) -> Result<Self> {
let default_object_store = stores
.get(default_object_store)
.context(DefaultStorageNotFoundSnafu {
default_object_store,
})?
.clone();
Ok(ObjectStoreManager {
stores,
default_object_store,
})
}
pub fn find(&self, name: &str) -> Option<&ObjectStore> {
self.stores.get(name)
}
pub fn default_object_store(&self) -> &ObjectStore {
&self.default_object_store
}
}
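A hedged usage sketch (assuming the crate is imported as `object_store` and reusing the Fs-backed construction from the tests below; the per-table fallback helper is illustrative only, not part of the API):

use std::collections::HashMap;

use common_test_util::temp_dir::create_temp_dir;
use object_store::manager::ObjectStoreManager; // module path assumed
use object_store::services::Fs;
use object_store::ObjectStore;

fn main() {
    // Build a single Fs-backed store the same way the tests below do.
    let dir = create_temp_dir("manager-demo");
    let mut builder = Fs::default();
    let _ = builder.root(dir.path().to_str().unwrap());
    let store = ObjectStore::new(builder).unwrap().finish();

    let stores: HashMap<String, ObjectStore> =
        [("File".to_string(), store)].into_iter().collect();
    let manager = ObjectStoreManager::try_new(stores, "File").unwrap();

    // Resolve a per-table storage name, falling back to the default backend.
    let requested: Option<&str> = Some("S3");
    let _resolved = requested
        .and_then(|name| manager.find(name))
        .unwrap_or_else(|| manager.default_object_store());
}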
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use common_test_util::temp_dir::{create_temp_dir, TempDir};
use super::ObjectStoreManager;
use crate::error::Error;
use crate::services::Fs as Builder;
use crate::ObjectStore;
fn new_object_store(dir: &TempDir) -> ObjectStore {
let store_dir = dir.path().to_str().unwrap();
let mut builder = Builder::default();
let _ = builder.root(store_dir);
ObjectStore::new(builder).unwrap().finish()
}
#[test]
fn test_new_returns_err_when_global_store_not_exist() {
let dir = create_temp_dir("new");
let object_store = new_object_store(&dir);
let stores: HashMap<String, ObjectStore> = vec![
("File".to_string(), object_store.clone()),
("S3".to_string(), object_store.clone()),
]
.into_iter()
.collect();
assert!(matches!(
ObjectStoreManager::try_new(stores, "Gcs"),
Err(Error::DefaultStorageNotFound { .. })
));
}
#[test]
fn test_new_returns_ok() {
let dir = create_temp_dir("new");
let object_store = new_object_store(&dir);
let stores: HashMap<String, ObjectStore> = vec![
("File".to_string(), object_store.clone()),
("S3".to_string(), object_store.clone()),
]
.into_iter()
.collect();
let object_store_manager = ObjectStoreManager::try_new(stores, "File").unwrap();
assert_eq!(object_store_manager.stores.len(), 2);
assert!(object_store_manager.find("File").is_some());
assert!(object_store_manager.find("S3").is_some());
assert!(object_store_manager.find("Gcs").is_none());
}
}

View File

@@ -12,6 +12,7 @@ catalog = { workspace = true }
common-catalog = { workspace = true }
common-error = { workspace = true }
common-macro = { workspace = true }
common-recordbatch = { workspace = true }
common-telemetry = { workspace = true }
datafusion.workspace = true
datatypes = { workspace = true }

View File

@@ -13,6 +13,7 @@
// limitations under the License.
mod empty_metric;
mod histogram_fold;
mod instant_manipulate;
mod normalize;
mod planner;

View File

@@ -0,0 +1,798 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::task::Poll;
use std::time::Instant;
use common_recordbatch::RecordBatch as GtRecordBatch;
use common_telemetry::warn;
use datafusion::arrow::array::AsArray;
use datafusion::arrow::compute::{self, concat_batches, SortOptions};
use datafusion::arrow::datatypes::{DataType, Field, Float64Type, SchemaRef};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::common::{DFField, DFSchema, DFSchemaRef};
use datafusion::error::{DataFusionError, Result as DataFusionResult};
use datafusion::execution::TaskContext;
use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore};
use datafusion::physical_expr::{PhysicalSortExpr, PhysicalSortRequirement};
use datafusion::physical_plan::expressions::Column as PhyColumn;
use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
use datafusion::physical_plan::{
DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr,
RecordBatchStream, SendableRecordBatchStream, Statistics,
};
use datafusion::prelude::{Column, Expr};
use datatypes::prelude::{ConcreteDataType, DataType as GtDataType};
use datatypes::schema::Schema as GtSchema;
use datatypes::value::{ListValue, Value};
use datatypes::vectors::MutableVector;
use futures::{ready, Stream, StreamExt};
/// `HistogramFold` folds the conventional (non-native) histogram ([1]) for later
/// computation. Specifically, it transforms the `le` and `field` columns into a complex
/// type and samples the other tag columns:
/// - `le` becomes a [ListArray] of [f64], with each bucket bound parsed
/// - `field` becomes a [ListArray] of [f64]
/// - other columns are sampled every `bucket_num` elements, but their types don't change.
///
/// Due to the folding and sampling, the number of output rows becomes `input_rows` / `bucket_num`.
///
/// # Requirements
/// - Input should be sorted on `<tag list>, le ASC, ts`.
/// - The value set of `le` should be the same, i.e., every series should have the same buckets.
///
/// [1]: https://prometheus.io/docs/concepts/metric_types/#histogram
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct HistogramFold {
/// Name of the `le` column. It's a special column in Prometheus
/// for implementing the conventional histogram. It's a string column
/// with "literal" float values, like "+Inf", "0.001", etc.
le_column: String,
ts_column: String,
input: LogicalPlan,
field_column: String,
output_schema: DFSchemaRef,
}
impl UserDefinedLogicalNodeCore for HistogramFold {
fn name(&self) -> &str {
Self::name()
}
fn inputs(&self) -> Vec<&LogicalPlan> {
vec![&self.input]
}
fn schema(&self) -> &DFSchemaRef {
&self.output_schema
}
fn expressions(&self) -> Vec<Expr> {
vec![]
}
fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"HistogramFold: le={}, field={}",
self.le_column, self.field_column
)
}
fn from_template(&self, _exprs: &[Expr], inputs: &[LogicalPlan]) -> Self {
Self {
le_column: self.le_column.clone(),
ts_column: self.ts_column.clone(),
input: inputs[0].clone(),
field_column: self.field_column.clone(),
// This method cannot return an error; otherwise we would need to recompute the output schema.
output_schema: self.output_schema.clone(),
}
}
}
impl HistogramFold {
#[allow(dead_code)]
pub fn new(
le_column: String,
field_column: String,
ts_column: String,
input: LogicalPlan,
) -> DataFusionResult<Self> {
let input_schema = input.schema();
Self::check_schema(input_schema, &le_column, &field_column, &ts_column)?;
let output_schema = Self::convert_schema(input_schema, &le_column, &field_column)?;
Ok(Self {
le_column,
ts_column,
input,
field_column,
output_schema,
})
}
pub const fn name() -> &'static str {
"HistogramFold"
}
fn check_schema(
input_schema: &DFSchemaRef,
le_column: &str,
field_column: &str,
ts_column: &str,
) -> DataFusionResult<()> {
let check_column = |col| {
if !input_schema.has_column_with_unqualified_name(col) {
return Err(DataFusionError::SchemaError(
datafusion::common::SchemaError::FieldNotFound {
field: Box::new(Column::new(None::<String>, col)),
valid_fields: input_schema
.fields()
.iter()
.map(|f| f.qualified_column())
.collect(),
},
));
} else {
Ok(())
}
};
check_column(le_column)?;
check_column(ts_column)?;
check_column(field_column)
}
#[allow(dead_code)]
pub fn to_execution_plan(&self, exec_input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
let input_schema = self.input.schema();
// safety: those fields are checked in `check_schema()`
let le_column_index = input_schema
.index_of_column_by_name(None, &self.le_column)
.unwrap()
.unwrap();
let field_column_index = input_schema
.index_of_column_by_name(None, &self.field_column)
.unwrap()
.unwrap();
let ts_column_index = input_schema
.index_of_column_by_name(None, &self.ts_column)
.unwrap()
.unwrap();
Arc::new(HistogramFoldExec {
le_column_index,
field_column_index,
ts_column_index,
input: exec_input,
output_schema: Arc::new(self.output_schema.as_ref().into()),
metric: ExecutionPlanMetricsSet::new(),
})
}
/// Transform the schema
///
/// - `le` will become a [ListArray] of [f64], with each bucket bound parsed
/// - `field` will become a [ListArray] of [f64]
fn convert_schema(
input_schema: &DFSchemaRef,
le_column: &str,
field_column: &str,
) -> DataFusionResult<DFSchemaRef> {
let mut fields = input_schema.fields().clone();
// safety: those fields are checked in `check_schema()`
let le_column_idx = input_schema
.index_of_column_by_name(None, le_column)?
.unwrap();
let field_column_idx = input_schema
.index_of_column_by_name(None, field_column)?
.unwrap();
// transform `le`
let le_field: Field = fields[le_column_idx].field().as_ref().clone();
let le_field = le_field.with_data_type(DataType::Float64);
let folded_le_datatype = DataType::List(Arc::new(le_field));
let folded_le = DFField::new(
fields[le_column_idx].qualifier().cloned(),
fields[le_column_idx].name(),
folded_le_datatype,
false,
);
// transform `field`
// to avoid ambiguity, that field will be referenced as `the_field` below.
let the_field: Field = fields[field_column_idx].field().as_ref().clone();
let folded_field_datatype = DataType::List(Arc::new(the_field));
let folded_field = DFField::new(
fields[field_column_idx].qualifier().cloned(),
fields[field_column_idx].name(),
folded_field_datatype,
false,
);
fields[le_column_idx] = folded_le;
fields[field_column_idx] = folded_field;
Ok(Arc::new(DFSchema::new_with_metadata(
fields,
HashMap::new(),
)?))
}
}
#[derive(Debug)]
pub struct HistogramFoldExec {
/// Index for `le` column in the schema of input.
le_column_index: usize,
input: Arc<dyn ExecutionPlan>,
output_schema: SchemaRef,
/// Index for field column in the schema of input.
field_column_index: usize,
ts_column_index: usize,
metric: ExecutionPlanMetricsSet,
}
impl ExecutionPlan for HistogramFoldExec {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> SchemaRef {
self.output_schema.clone()
}
fn output_partitioning(&self) -> Partitioning {
self.input.output_partitioning()
}
fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
self.input.output_ordering()
}
fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirement>>> {
let mut cols = self
.tag_col_exprs()
.into_iter()
.map(|expr| PhysicalSortRequirement {
expr,
options: None,
})
.collect::<Vec<PhysicalSortRequirement>>();
// add le ASC
cols.push(PhysicalSortRequirement {
expr: Arc::new(PhyColumn::new(
self.output_schema.field(self.le_column_index).name(),
self.le_column_index,
)),
options: Some(SortOptions {
descending: false, // +Inf sorts last
nulls_first: false, // not nullable
}),
});
// add ts
cols.push(PhysicalSortRequirement {
expr: Arc::new(PhyColumn::new(
self.output_schema.field(self.ts_column_index).name(),
self.ts_column_index,
)),
options: None,
});
vec![Some(cols)]
}
fn required_input_distribution(&self) -> Vec<Distribution> {
// partition on all tag columns, i.e., non-le, non-ts and non-field columns
vec![Distribution::HashPartitioned(self.tag_col_exprs())]
}
fn maintains_input_order(&self) -> Vec<bool> {
vec![true; self.children().len()]
}
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
vec![self.input.clone()]
}
// cannot change schema with this method
fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
assert!(!children.is_empty());
Ok(Arc::new(Self {
input: children[0].clone(),
metric: self.metric.clone(),
le_column_index: self.le_column_index,
ts_column_index: self.ts_column_index,
output_schema: self.output_schema.clone(),
field_column_index: self.field_column_index,
}))
}
fn execute(
&self,
partition: usize,
context: Arc<TaskContext>,
) -> DataFusionResult<SendableRecordBatchStream> {
let baseline_metric = BaselineMetrics::new(&self.metric, partition);
let batch_size = context.session_config().batch_size();
let input = self.input.execute(partition, context)?;
let output_schema = self.output_schema.clone();
let mut normal_indices = (0..output_schema.fields().len()).collect::<HashSet<_>>();
normal_indices.remove(&self.le_column_index);
normal_indices.remove(&self.field_column_index);
Ok(Box::pin(HistogramFoldStream {
le_column_index: self.le_column_index,
field_column_index: self.field_column_index,
normal_indices: normal_indices.into_iter().collect(),
bucket_size: None,
input_buffer: vec![],
input,
output_schema,
metric: baseline_metric,
batch_size,
input_buffered_rows: 0,
output_buffer: HistogramFoldStream::empty_output_buffer(&self.output_schema)?,
output_buffered_rows: 0,
}))
}
fn metrics(&self) -> Option<MetricsSet> {
Some(self.metric.clone_inner())
}
fn statistics(&self) -> Statistics {
Statistics {
num_rows: None,
total_byte_size: None,
column_statistics: None,
is_exact: false,
}
}
}
impl HistogramFoldExec {
/// Return all the [PhysicalExpr] of tag columns in order.
///
/// Tag columns are all columns except `le`, `field` and `ts` columns.
pub fn tag_col_exprs(&self) -> Vec<Arc<dyn PhysicalExpr>> {
self.input
.schema()
.fields()
.iter()
.enumerate()
.filter_map(|(idx, field)| {
if idx == self.le_column_index
|| idx == self.field_column_index
|| idx == self.ts_column_index
{
None
} else {
Some(Arc::new(PhyColumn::new(field.name(), idx)) as _)
}
})
.collect()
}
}
impl DisplayAs for HistogramFoldExec {
fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match t {
DisplayFormatType::Default | DisplayFormatType::Verbose => {
write!(
f,
"HistogramFoldExec: le=@{}, field=@{}",
self.le_column_index, self.field_column_index
)
}
}
}
}
pub struct HistogramFoldStream {
// internal states
le_column_index: usize,
field_column_index: usize,
/// Columns that do not need folding
normal_indices: Vec<usize>,
bucket_size: Option<usize>,
/// Expected output batch size
batch_size: usize,
output_schema: SchemaRef,
// buffers
input_buffer: Vec<RecordBatch>,
input_buffered_rows: usize,
output_buffer: Vec<Box<dyn MutableVector>>,
output_buffered_rows: usize,
// runtime things
input: SendableRecordBatchStream,
metric: BaselineMetrics,
}
impl RecordBatchStream for HistogramFoldStream {
fn schema(&self) -> SchemaRef {
self.output_schema.clone()
}
}
impl Stream for HistogramFoldStream {
type Item = DataFusionResult<RecordBatch>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Option<Self::Item>> {
let poll = loop {
match ready!(self.input.poll_next_unpin(cx)) {
Some(batch) => {
let batch = batch?;
let timer = Instant::now();
let Some(result) = self.fold_input(batch)? else {
self.metric.elapsed_compute().add_elapsed(timer);
continue;
};
self.metric.elapsed_compute().add_elapsed(timer);
break Poll::Ready(Some(result));
}
None => break Poll::Ready(self.take_output_buf()?.map(Ok)),
}
};
self.metric.record_poll(poll)
}
}
impl HistogramFoldStream {
/// The innermost `Result` is for `poll_next()`
pub fn fold_input(
&mut self,
input: RecordBatch,
) -> DataFusionResult<Option<DataFusionResult<RecordBatch>>> {
let Some(bucket_num) = self.calculate_bucket_num(&input)? else {
return Ok(None);
};
if self.input_buffered_rows + input.num_rows() < bucket_num {
// not enough rows to fold
self.push_input_buf(input);
return Ok(None);
}
self.fold_buf(bucket_num, input)?;
if self.output_buffered_rows >= self.batch_size {
return Ok(self.take_output_buf()?.map(Ok));
}
Ok(None)
}
pub fn empty_output_buffer(
schema: &SchemaRef,
) -> DataFusionResult<Vec<Box<dyn MutableVector>>> {
let mut builders = Vec::with_capacity(schema.fields().len());
for field in schema.fields() {
let concrete_datatype = ConcreteDataType::try_from(field.data_type()).unwrap();
let mutable_vector = concrete_datatype.create_mutable_vector(0);
builders.push(mutable_vector);
}
Ok(builders)
}
fn calculate_bucket_num(&mut self, batch: &RecordBatch) -> DataFusionResult<Option<usize>> {
if let Some(size) = self.bucket_size {
return Ok(Some(size));
}
let inf_pos = self.find_positive_inf(batch)?;
if inf_pos == batch.num_rows() {
// no positive inf found, append to buffer and wait for next batch
self.push_input_buf(batch.clone());
return Ok(None);
}
// else we found the positive inf.
// calculate the bucket size
let bucket_size = inf_pos + self.input_buffered_rows + 1;
Ok(Some(bucket_size))
}
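A worked example of the bucket-size derivation above, as a standalone sketch:

fn bucket_size(buffered_rows: usize, inf_index_in_batch: usize) -> usize {
    // Mirrors `inf_pos + self.input_buffered_rows + 1` in calculate_bucket_num.
    inf_index_in_batch + buffered_rows + 1
}

fn main() {
    // 3 rows already buffered and "+Inf" found at index 1 of the new batch:
    // the bucket group spans 5 rows (e.g. le = 0.001, 0.1, 10, 1000, +Inf).
    assert_eq!(bucket_size(3, 1), 5);
}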
/// Fold record batches from the input buffer and push the result to the output buffer
fn fold_buf(&mut self, bucket_num: usize, input: RecordBatch) -> DataFusionResult<()> {
self.push_input_buf(input);
// TODO(ruihang): this concat is avoidable.
let batch = concat_batches(&self.input.schema(), self.input_buffer.drain(..).as_ref())?;
let mut remaining_rows = self.input_buffered_rows;
let mut cursor = 0;
let gt_schema = GtSchema::try_from(self.input.schema()).unwrap();
let batch = GtRecordBatch::try_from_df_record_batch(Arc::new(gt_schema), batch).unwrap();
while remaining_rows >= bucket_num {
// "sample" normal columns
for normal_index in &self.normal_indices {
let val = batch.column(*normal_index).get(cursor);
self.output_buffer[*normal_index].push_value_ref(val.as_value_ref());
}
// "fold" `le` and field columns
let le_array = batch.column(self.le_column_index);
let field_array = batch.column(self.field_column_index);
let mut le_item = vec![];
let mut field_item = vec![];
for bias in 0..bucket_num {
let le_str_val = le_array.get(cursor + bias);
let le_str_val_ref = le_str_val.as_value_ref();
let le_str = le_str_val_ref
.as_string()
.unwrap()
.expect("le column should not be nullable");
let le = le_str.parse::<f64>().unwrap();
let le_val = Value::from(le);
le_item.push(le_val);
let field = field_array.get(cursor + bias);
field_item.push(field);
}
let le_list_val = Value::List(ListValue::new(
Some(Box::new(le_item)),
ConcreteDataType::float64_datatype(),
));
let field_list_val = Value::List(ListValue::new(
Some(Box::new(field_item)),
ConcreteDataType::float64_datatype(),
));
self.output_buffer[self.le_column_index].push_value_ref(le_list_val.as_value_ref());
self.output_buffer[self.field_column_index]
.push_value_ref(field_list_val.as_value_ref());
cursor += bucket_num;
remaining_rows -= bucket_num;
self.output_buffered_rows += 1;
}
let remaining_input_batch = batch.into_df_record_batch().slice(cursor, remaining_rows);
self.input_buffered_rows = remaining_input_batch.num_rows();
self.input_buffer.push(remaining_input_batch);
Ok(())
}
fn push_input_buf(&mut self, batch: RecordBatch) {
self.input_buffered_rows += batch.num_rows();
self.input_buffer.push(batch);
}
fn take_output_buf(&mut self) -> DataFusionResult<Option<RecordBatch>> {
if self.output_buffered_rows == 0 {
if self.input_buffered_rows != 0 {
warn!(
"input buffer is not empty, {} rows remaining",
self.input_buffered_rows
);
}
return Ok(None);
}
let mut output_buf = Self::empty_output_buffer(&self.output_schema)?;
std::mem::swap(&mut self.output_buffer, &mut output_buf);
let mut columns = Vec::with_capacity(output_buf.len());
for builder in output_buf.iter_mut() {
columns.push(builder.to_vector().to_arrow_array());
}
// overwrite default list datatype to change field name
columns[self.le_column_index] = compute::cast(
&columns[self.le_column_index],
self.output_schema.field(self.le_column_index).data_type(),
)?;
columns[self.field_column_index] = compute::cast(
&columns[self.field_column_index],
self.output_schema
.field(self.field_column_index)
.data_type(),
)?;
self.output_buffered_rows = 0;
RecordBatch::try_new(self.output_schema.clone(), columns)
.map(Some)
.map_err(DataFusionError::ArrowError)
}
/// Find the first `+Inf`, which indicates the end of the bucket group.
///
/// If the return value equals the batch's `num_rows`, `+Inf` was not found
/// in this batch.
fn find_positive_inf(&self, batch: &RecordBatch) -> DataFusionResult<usize> {
// Fuse this function: it should not be called once the bucket size is already known.
if let Some(bucket_size) = self.bucket_size {
return Ok(bucket_size);
}
let string_le_array = batch.column(self.le_column_index);
let float_le_array = compute::cast(&string_le_array, &DataType::Float64).map_err(|e| {
DataFusionError::Execution(format!(
"cannot cast {} array to float64 array: {:?}",
string_le_array.data_type(),
e
))
})?;
let le_as_f64_array = float_le_array
.as_primitive_opt::<Float64Type>()
.ok_or_else(|| {
DataFusionError::Execution(format!(
"expect a float64 array, but found {}",
float_le_array.data_type()
))
})?;
for (i, v) in le_as_f64_array.iter().enumerate() {
if let Some(v) = v && v == f64::INFINITY {
return Ok(i);
}
}
Ok(batch.num_rows())
}
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use datafusion::arrow::array::Float64Array;
use datafusion::arrow::datatypes::Schema;
use datafusion::common::ToDFSchema;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::prelude::SessionContext;
use datatypes::arrow_array::StringArray;
use super::*;
fn prepare_test_data() -> MemoryExec {
let schema = Arc::new(Schema::new(vec![
Field::new("host", DataType::Utf8, true),
Field::new("le", DataType::Utf8, true),
Field::new("val", DataType::Float64, true),
]));
// 12 items
let host_column_1 = Arc::new(StringArray::from(vec![
"host_1", "host_1", "host_1", "host_1", "host_1", "host_1", "host_1", "host_1",
"host_1", "host_1", "host_1", "host_1",
])) as _;
let le_column_1 = Arc::new(StringArray::from(vec![
"0.001", "0.1", "10", "1000", "+Inf", "0.001", "0.1", "10", "1000", "+inf", "0.001",
"0.1",
])) as _;
let val_column_1 = Arc::new(Float64Array::from(vec![
0_0.0, 1.0, 1.0, 5.0, 5.0, 0_0.0, 20.0, 60.0, 70.0, 100.0, 0_1.0, 1.0,
])) as _;
// 2 items
let host_column_2 = Arc::new(StringArray::from(vec!["host_1", "host_1"])) as _;
let le_column_2 = Arc::new(StringArray::from(vec!["10", "1000"])) as _;
let val_column_2 = Arc::new(Float64Array::from(vec![1.0, 1.0])) as _;
// 11 items
let host_column_3 = Arc::new(StringArray::from(vec![
"host_1", "host_2", "host_2", "host_2", "host_2", "host_2", "host_2", "host_2",
"host_2", "host_2", "host_2",
])) as _;
let le_column_3 = Arc::new(StringArray::from(vec![
"+INF", "0.001", "0.1", "10", "1000", "+iNf", "0.001", "0.1", "10", "1000", "+Inf",
])) as _;
let val_column_3 = Arc::new(Float64Array::from(vec![
1.0, 0_0.0, 0.0, 0.0, 0.0, 0.0, 0_0.0, 1.0, 2.0, 3.0, 4.0,
])) as _;
let data_1 = RecordBatch::try_new(
schema.clone(),
vec![host_column_1, le_column_1, val_column_1],
)
.unwrap();
let data_2 = RecordBatch::try_new(
schema.clone(),
vec![host_column_2, le_column_2, val_column_2],
)
.unwrap();
let data_3 = RecordBatch::try_new(
schema.clone(),
vec![host_column_3, le_column_3, val_column_3],
)
.unwrap();
MemoryExec::try_new(&[vec![data_1, data_2, data_3]], schema, None).unwrap()
}
#[tokio::test]
async fn fold_overall() {
let memory_exec = Arc::new(prepare_test_data());
let output_schema = Arc::new(
(*HistogramFold::convert_schema(
&Arc::new(memory_exec.schema().to_dfschema().unwrap()),
"le",
"val",
)
.unwrap()
.as_ref())
.clone()
.into(),
);
let fold_exec = Arc::new(HistogramFoldExec {
le_column_index: 1,
field_column_index: 2,
ts_column_index: 9999, // not exist but doesn't matter
input: memory_exec,
output_schema,
metric: ExecutionPlanMetricsSet::new(),
});
let session_context = SessionContext::default();
let result = datafusion::physical_plan::collect(fold_exec, session_context.task_ctx())
.await
.unwrap();
let result_literal = datatypes::arrow::util::pretty::pretty_format_batches(&result)
.unwrap()
.to_string();
let expected = String::from(
"+--------+---------------------------------+--------------------------------+
| host | le | val |
+--------+---------------------------------+--------------------------------+
| host_1 | [0.001, 0.1, 10.0, 1000.0, inf] | [0.0, 1.0, 1.0, 5.0, 5.0] |
| host_1 | [0.001, 0.1, 10.0, 1000.0, inf] | [0.0, 20.0, 60.0, 70.0, 100.0] |
| host_1 | [0.001, 0.1, 10.0, 1000.0, inf] | [1.0, 1.0, 1.0, 1.0, 1.0] |
| host_2 | [0.001, 0.1, 10.0, 1000.0, inf] | [0.0, 0.0, 0.0, 0.0, 0.0] |
| host_2 | [0.001, 0.1, 10.0, 1000.0, inf] | [0.0, 1.0, 2.0, 3.0, 4.0] |
+--------+---------------------------------+--------------------------------+",
);
assert_eq!(result_literal, expected);
}
#[test]
fn confirm_schema() {
let input_schema = Schema::new(vec![
Field::new("host", DataType::Utf8, true),
Field::new("le", DataType::Utf8, true),
Field::new("val", DataType::Float64, true),
])
.to_dfschema_ref()
.unwrap();
let expected_output_schema = Schema::new(vec![
Field::new("host", DataType::Utf8, true),
Field::new(
"le",
DataType::List(Arc::new(Field::new("le", DataType::Float64, true))),
false,
),
Field::new(
"val",
DataType::List(Arc::new(Field::new("val", DataType::Float64, true))),
false,
),
])
.to_dfschema_ref()
.unwrap();
let actual = HistogramFold::convert_schema(&input_schema, "le", "val").unwrap();
assert_eq!(actual, expected_output_schema)
}
}

View File

@@ -61,6 +61,8 @@ use crate::functions::{
/// `time()` function in PromQL.
const SPECIAL_TIME_FUNCTION: &str = "time";
/// `histogram_quantile` function in PromQL
const SPECIAL_HISTOGRAM_QUANTILE: &str = "histogram_quantile";
const DEFAULT_TIME_INDEX_COLUMN: &str = "time";
@@ -440,6 +442,10 @@ impl PromPlanner {
}));
}
if func.name == SPECIAL_HISTOGRAM_QUANTILE {
todo!()
}
let args = self.create_function_args(&args.args)?;
let input = self
.prom_expr_to_plan(args.input.with_context(|| ExpectExprSnafu {

View File

@@ -274,6 +274,9 @@ pub enum Error {
#[snafu(display("Missing table mutation handler"))]
MissingTableMutationHandler { location: Location },
#[snafu(display("Range Query: {}", msg))]
RangeQuery { msg: String, location: Location },
}
impl ErrorExt for Error {
@@ -281,7 +284,9 @@ impl ErrorExt for Error {
use Error::*;
match self {
QueryParse { .. } | MultipleStatements { .. } => StatusCode::InvalidSyntax,
QueryParse { .. } | MultipleStatements { .. } | RangeQuery { .. } => {
StatusCode::InvalidSyntax
}
UnsupportedExpr { .. }
| Unimplemented { .. }
| CatalogNotFound { .. }
@@ -311,9 +316,13 @@ impl ErrorExt for Error {
ParseSql { source, .. } => source.status_code(),
CreateRecordBatch { source, .. } => source.status_code(),
QueryExecution { source, .. } | QueryPlan { source, .. } => source.status_code(),
DataFusion { .. } | MissingTimestampColumn { .. } | RoutePartition { .. } => {
StatusCode::Internal
}
DataFusion { error, .. } => match error {
DataFusionError::Internal(_) => StatusCode::Internal,
DataFusionError::NotImplemented(_) => StatusCode::Unsupported,
DataFusionError::Plan(_) => StatusCode::PlanQuery,
_ => StatusCode::EngineExecuteQuery,
},
MissingTimestampColumn { .. } | RoutePartition { .. } => StatusCode::EngineExecuteQuery,
Sql { source, .. } => source.status_code(),
PlanSql { .. } => StatusCode::PlanQuery,
ConvertSqlType { source, .. } | ConvertSqlValue { source, .. } => source.status_code(),

View File

@@ -79,7 +79,7 @@ impl DfLogicalPlanner {
let result = sql_to_rel
.statement_to_plan(df_stmt)
.context(PlanSqlSnafu)?;
let plan = RangePlanRewriter::new(table_provider, context_provider)
let plan = RangePlanRewriter::new(table_provider)
.rewrite(result)
.await?;
Ok(LogicalPlan::DfPlan(plan))

View File

@@ -21,7 +21,7 @@ use std::task::{Context, Poll};
use std::time::Duration;
use ahash::RandomState;
use arrow::compute;
use arrow::compute::{self, cast_with_options, CastOptions};
use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit};
use common_query::DfPhysicalPlan;
use common_recordbatch::DfSendableRecordBatchStream;
@@ -33,6 +33,7 @@ use datafusion::physical_plan::udaf::create_aggregate_expr as create_aggr_udf_ex
use datafusion::physical_plan::{
DisplayAs, DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream,
};
use datafusion::physical_planner::create_physical_sort_expr;
use datafusion_common::utils::get_arrayref_at_indices;
use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, ScalarValue};
use datafusion_expr::utils::exprlist_to_fields;
@@ -54,22 +55,135 @@ use crate::error::{DataFusionSnafu, Result};
type Millisecond = <TimestampMillisecondType as ArrowPrimitiveType>::Native;
#[derive(PartialEq, Eq, Hash, Clone, Debug)]
#[derive(PartialEq, Eq, Debug, Hash, Clone)]
pub enum Fill {
Null,
Prev,
Linear,
Const(ScalarValue),
}
impl Display for Fill {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Fill::Null => write!(f, "NULL"),
Fill::Prev => write!(f, "PREV"),
Fill::Linear => write!(f, "LINEAR"),
Fill::Const(x) => write!(f, "{}", x),
}
}
}
impl Fill {
pub fn try_from_str(value: &str, datatype: &DataType) -> DfResult<Self> {
let s = value.to_uppercase();
match s.as_str() {
"NULL" | "" => Ok(Self::Null),
"PREV" => Ok(Self::Prev),
"LINEAR" => {
if datatype.is_numeric() {
Ok(Self::Linear)
} else {
Err(DataFusionError::Plan(format!(
"Use FILL LINEAR on Non-numeric DataType {}",
datatype
)))
}
}
_ => ScalarValue::try_from_string(s.clone(), datatype)
.map_err(|err| {
DataFusionError::Plan(format!(
"{} is not a valid fill option, fail to convert to a const value. {{ {} }}",
s, err
))
})
.map(Fill::Const),
}
}
/// The input `data` contains data on a complete time series.
/// If the filling strategy is `PREV` or `LINEAR`, the caller must ensure that the incoming `data` is in ascending time order.
pub fn apply_fill_strategy(&self, data: &mut [ScalarValue]) -> DfResult<()> {
let len = data.len();
for i in 0..len {
if data[i].is_null() {
match self {
Fill::Null => continue,
Fill::Prev => {
if i != 0 {
data[i] = data[i - 1].clone()
}
}
Fill::Linear => {
if 0 < i && i < len - 1 {
match (&data[i - 1], &data[i + 1]) {
(ScalarValue::Float64(Some(a)), ScalarValue::Float64(Some(b))) => {
data[i] = ScalarValue::Float64(Some((a + b) / 2.0));
}
(ScalarValue::Float32(Some(a)), ScalarValue::Float32(Some(b))) => {
data[i] = ScalarValue::Float32(Some((a + b) / 2.0));
}
(a, b) => {
if !a.is_null() && !b.is_null() {
return Err(DataFusionError::Execution(
"RangePlan: Apply Fill LINEAR strategy on Non-floating type".to_string()));
} else {
continue;
}
}
}
}
}
Fill::Const(v) => data[i] = v.clone(),
}
}
}
Ok(())
}
}
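For illustration, here is a minimal usage sketch of the `Fill` strategies above. It is not part of the patch; it assumes it lives in the same module, so `Fill`, `ScalarValue`, `DataType`, and the `DfResult` alias are in scope.
// Hypothetical sketch (not in the patch): how the fill strategies behave.
fn fill_sketch() -> DfResult<()> {
    // LINEAR is only accepted for numeric data types.
    let fill = Fill::try_from_str("LINEAR", &DataType::Float64)?;
    let mut series = vec![
        ScalarValue::Float64(Some(1.0)),
        ScalarValue::Float64(None), // gap to interpolate
        ScalarValue::Float64(Some(3.0)),
    ];
    // The series must already be in ascending time order (see the doc comment above).
    fill.apply_fill_strategy(&mut series)?;
    assert_eq!(series[1], ScalarValue::Float64(Some(2.0)));
    // A constant fill parses any string castable to the target type.
    assert_eq!(
        Fill::try_from_str("8", &DataType::UInt8)?,
        Fill::Const(ScalarValue::UInt8(Some(8)))
    );
    Ok(())
}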
#[derive(Eq, Clone, Debug)]
pub struct RangeFn {
/// The display name of the range expr, with a format like `MAX(a) RANGE 5m FILL NULL`
pub name: String,
pub data_type: DataType,
pub expr: Expr,
pub range: Duration,
pub fill: String,
pub fill: Fill,
/// If the `Fill` strategy is `Linear` and the output is an integer,
/// the interpolation may produce a floating point number.
/// So for `FILL LINEAR`, the entire column is implicitly converted to a float type.
/// If `need_cast == true`, `data_type` may not be consistent with the type that `expr` generates.
pub need_cast: bool,
}
impl PartialEq for RangeFn {
fn eq(&self, other: &Self) -> bool {
self.name == other.name
}
}
impl PartialOrd for RangeFn {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for RangeFn {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.name.cmp(&other.name)
}
}
impl std::hash::Hash for RangeFn {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.name.hash(state);
}
}
impl Display for RangeFn {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"RangeFn {{ expr:{} range:{}s fill:{} }}",
self.expr.display_name().unwrap_or("?".into()),
self.range.as_secs(),
self.fill,
)
write!(f, "{}", self.name)
}
}
@@ -105,16 +219,21 @@ impl RangeSelect {
) -> Result<Self> {
let mut fields = range_expr
.iter()
.map(|RangeFn { expr, .. }| {
Ok(DFField::new_unqualified(
&expr.display_name()?,
expr.get_type(input.schema())?,
// TODO(Taylor-lagrange): We have not implemented fill currently,
// it is possible that some columns may not be able to aggregate data,
// so we temporarily set that all data is nullable
true,
))
})
.map(
|RangeFn {
name,
data_type,
fill,
..
}| {
Ok(DFField::new_unqualified(
name,
data_type.clone(),
// Only when the data is filled with the Const option can the column be non-nullable
!matches!(fill, Fill::Const(..)),
))
},
)
.collect::<DfResult<Vec<_>>>()
.context(DataFusionSnafu)?;
// add align_ts
@@ -135,10 +254,8 @@ impl RangeSelect {
DFSchema::new_with_metadata(by_fields, input.schema().metadata().clone())
.context(DataFusionSnafu)?,
);
// If the result of the project plan happens to be the schema of the range plan, no project plan is required
// that need project is identical to range plan schema.
// 1. all exprs in project must belong to range schema
// 2. range schema and project exprs must have same size
// If the results of the project plan can be obtained directly from the range plan without any additional calculation, no project plan is required.
// We can simply project the final output of the range plan to produce the final result.
let schema_project = projection_expr
.iter()
.map(|project_expr| {
@@ -268,52 +385,68 @@ impl RangeSelect {
.range_expr
.iter()
.map(|range_fn| {
let (expr, args) = match &range_fn.expr {
let expr = match &range_fn.expr {
Expr::AggregateFunction(aggr) => {
let args = self.create_physical_expr_list(
&aggr.args,
input_dfschema,
&input_schema,
session_state,
)?;
Ok((
create_aggr_expr(
&aggr.fun,
false,
&args,
&[],
let order_by = if let Some(exprs) = &aggr.order_by {
exprs
.iter()
.map(|x| {
create_physical_sort_expr(
x,
input_dfschema,
&input_schema,
session_state.execution_props(),
)
})
.collect::<DfResult<Vec<_>>>()?
} else {
vec![]
};
let expr = create_aggr_expr(
&aggr.fun,
false,
&self.create_physical_expr_list(
&aggr.args,
input_dfschema,
&input_schema,
range_fn.expr.display_name()?,
session_state,
)?,
args,
))
&order_by,
&input_schema,
range_fn.expr.display_name()?,
)?;
Ok(expr)
}
Expr::AggregateUDF(aggr_udf) => {
let args = self.create_physical_expr_list(
&aggr_udf.args,
input_dfschema,
&input_schema,
session_state,
)?;
Ok((
create_aggr_udf_expr(
&aggr_udf.fun,
&args,
let expr = create_aggr_udf_expr(
&aggr_udf.fun,
&self.create_physical_expr_list(
&aggr_udf.args,
input_dfschema,
&input_schema,
range_fn.expr.display_name()?,
session_state,
)?,
args,
))
&input_schema,
range_fn.expr.display_name()?,
)?;
Ok(expr)
}
_ => Err(DataFusionError::Plan(format!(
"Unexpected Expr:{} in RangeSelect",
range_fn.expr.display_name()?
))),
}?;
let args = expr.expressions();
Ok(RangeFnExec {
expr,
args,
range: range_fn.range.as_millis() as Millisecond,
fill: range_fn.fill.clone(),
need_cast: if range_fn.need_cast {
Some(range_fn.data_type.clone())
} else {
None
},
})
})
.collect::<DfResult<Vec<_>>>()?;
@@ -348,6 +481,8 @@ struct RangeFnExec {
pub expr: Arc<dyn AggregateExpr>,
pub args: Vec<Arc<dyn PhysicalExpr>>,
pub range: Millisecond,
pub fill: Fill,
pub need_cast: Option<DataType>,
}
#[derive(Debug)]
@@ -540,6 +675,15 @@ fn align_to_calendar(
}
}
fn cast_scalar_values(values: &mut [ScalarValue], data_type: &DataType) -> DfResult<()> {
let array = ScalarValue::iter_to_array(values.to_vec())?;
let cast_array = cast_with_options(&array, data_type, &CastOptions::default())?;
for (i, value) in values.iter_mut().enumerate() {
*value = ScalarValue::try_from_array(&cast_array, i)?;
}
Ok(())
}
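A small hypothetical example of `cast_scalar_values` (not part of the patch; it assumes the same module scope, so `ScalarValue`, `DataType`, and `DfResult` are available): the function round-trips the scalars through an Arrow array so the cast follows `cast_with_options` semantics, which is what the `need_cast` path relies on.
// Hypothetical sketch: widen integer accumulator outputs to Float64 before LINEAR filling.
fn cast_sketch() -> DfResult<()> {
    let mut values = vec![
        ScalarValue::Int64(Some(1)),
        ScalarValue::Int64(None),
        ScalarValue::Int64(Some(3)),
    ];
    cast_scalar_values(&mut values, &DataType::Float64)?;
    assert_eq!(values[0], ScalarValue::Float64(Some(1.0)));
    assert_eq!(values[1], ScalarValue::Float64(None)); // nulls stay null
    Ok(())
}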
impl RangeSelectStream {
fn evaluate_many(
&self,
@@ -648,20 +792,57 @@ impl RangeSelectStream {
let mut columns: Vec<Arc<dyn Array>> =
Vec::with_capacity(1 + self.range_exec.len() + self.by.len());
let mut ts_builder = TimestampMillisecondBuilder::with_capacity(self.output_num_rows);
let mut all_scalar = vec![vec![]; self.range_exec.len()];
let mut all_scalar = vec![Vec::with_capacity(self.output_num_rows); self.range_exec.len()];
let mut by_rows = Vec::with_capacity(self.output_num_rows);
let mut start_index = 0;
// The RangePlan produces its output row by row. If a column uses the PREV or LINEAR filling strategy,
// we must sort the data of each time series by timestamp to determine the values used to fill NULLs.
let need_sort_output = self
.range_exec
.iter()
.any(|range| range.fill == Fill::Linear || range.fill == Fill::Prev);
for SeriesState {
row,
align_ts_accumulator,
} in self.series_map.values()
{
for (ts, accumulators) in align_ts_accumulator {
for (i, accumulator) in accumulators.iter().enumerate() {
all_scalar[i].push(accumulator.evaluate()?);
// collect data on time series
if !need_sort_output {
for (ts, accumulators) in align_ts_accumulator {
for (i, accumulator) in accumulators.iter().enumerate() {
all_scalar[i].push(accumulator.evaluate()?);
}
ts_builder.append_value(*ts);
}
by_rows.push(row.row());
ts_builder.append_value(*ts);
} else {
let mut keys = align_ts_accumulator.keys().copied().collect::<Vec<_>>();
keys.sort();
for key in &keys {
for (i, accumulator) in
align_ts_accumulator.get(key).unwrap().iter().enumerate()
{
all_scalar[i].push(accumulator.evaluate()?);
}
}
ts_builder.append_slice(&keys);
}
// apply fill strategy on time series
for (
i,
RangeFnExec {
fill, need_cast, ..
},
) in self.range_exec.iter().enumerate()
{
let time_series_data =
&mut all_scalar[i][start_index..start_index + align_ts_accumulator.len()];
if let Some(data_type) = need_cast {
cast_scalar_values(time_series_data, data_type)?;
}
fill.apply_fill_strategy(time_series_data)?;
}
by_rows.resize(by_rows.len() + align_ts_accumulator.len(), row.row());
start_index += align_ts_accumulator.len();
}
for column_scalar in all_scalar {
columns.push(ScalarValue::iter_to_array(column_scalar)?);
@@ -720,15 +901,15 @@ impl Stream for RangeSelectStream {
}
ExecutionState::ProducingOutput => {
let result = self.generate_output();
match result {
return match result {
// made output
Ok(batch) => {
self.exec_state = ExecutionState::Done;
return Poll::Ready(Some(Ok(batch)));
Poll::Ready(Some(Ok(batch)))
}
// error making output
Err(error) => return Poll::Ready(Some(Err(error))),
}
Err(error) => Poll::Ready(Some(Err(error))),
};
}
ExecutionState::Done => return Poll::Ready(None),
}
@@ -738,6 +919,34 @@ impl Stream for RangeSelectStream {
#[cfg(test)]
mod test {
macro_rules! nullable_array {
($builder:ident,) => {
};
($array_type:ident ; $($tail:tt)*) => {
paste::item! {
{
let mut builder = arrow::array::[<$array_type Builder>]::new();
nullable_array!(builder, $($tail)*);
builder.finish()
}
}
};
($builder:ident, null) => {
$builder.append_null();
};
($builder:ident, null, $($tail:tt)*) => {
$builder.append_null();
nullable_array!($builder, $($tail)*);
};
($builder:ident, $value:literal) => {
$builder.append_value($value);
};
($builder:ident, $value:literal, $($tail:tt)*) => {
$builder.append_value($value);
nullable_array!($builder, $($tail)*);
};
}
use arrow_schema::SortOptions;
use datafusion::arrow::datatypes::{
ArrowPrimitiveType, DataType, Field, Schema, TimestampMillisecondType,
@@ -747,33 +956,45 @@ mod test {
use datafusion::prelude::SessionContext;
use datafusion_physical_expr::expressions::{self, Column};
use datafusion_physical_expr::PhysicalSortExpr;
use datatypes::arrow::array::{Int64Array, TimestampMillisecondArray};
use datatypes::arrow::array::TimestampMillisecondArray;
use datatypes::arrow_array::StringArray;
use super::*;
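As a quick illustration of the `nullable_array!` test macro above (a hypothetical snippet, not part of the patch; it assumes the `arrow` crate and the `Array` trait used elsewhere in this module are in scope):
#[test]
fn nullable_array_sketch() {
    // Builds an Int64Array with a null at index 1: [0, null, 2].
    let with_gap = nullable_array!(Int64; 0, null, 2);
    assert_eq!(with_gap.len(), 3);
    assert!(with_gap.is_null(1));
}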
const TIME_INDEX_COLUMN: &str = "timestamp";
fn prepare_test_data() -> MemoryExec {
fn prepare_test_data(is_float: bool) -> MemoryExec {
let schema = Arc::new(Schema::new(vec![
Field::new(TIME_INDEX_COLUMN, TimestampMillisecondType::DATA_TYPE, true),
Field::new("value", DataType::Int64, true),
Field::new(
"value",
if is_float {
DataType::Float64
} else {
DataType::Int64
},
true,
),
Field::new("host", DataType::Utf8, true),
]));
let timestamp_column = Arc::new(TimestampMillisecondArray::from(vec![
// host 1 every 5s
0, 5_000, 10_000, 15_000, 20_000, 25_000, 30_000, 35_000, 40_000,
// host 2 every 5s
0, 5_000, 10_000, 15_000, 20_000, 25_000, 30_000, 35_000, 40_000,
let timestamp_column: Arc<dyn Array> = Arc::new(TimestampMillisecondArray::from(vec![
0, 5_000, 10_000, 15_000, 20_000, // host 1 every 5s
0, 5_000, 10_000, 15_000, 20_000, // host 2 every 5s
])) as _;
let values = vec![
0, 1, 2, 3, 4, 5, 6, 7, 8, // data for host 1
9, 10, 11, 12, 13, 14, 15, 16, 17, // data for host 2
];
let mut host = vec!["host1"; 9];
host.extend(vec!["host2"; 9]);
let value_column = Arc::new(Int64Array::from(values)) as _;
let host_column = Arc::new(StringArray::from(host)) as _;
let mut host = vec!["host1"; 5];
host.extend(vec!["host2"; 5]);
let value_column: Arc<dyn Array> = if is_float {
Arc::new(nullable_array!(Float64;
0.0, null, 1.0, null, 2.0, // data for host 1
3.0, null, 4.0, null, 5.0 // data for host 2
)) as _
} else {
Arc::new(nullable_array!(Int64;
0, null, 1, null, 2, // data for host 1
3, null, 4, null, 5 // data for host 2
)) as _
};
let host_column: Arc<dyn Array> = Arc::new(StringArray::from(host)) as _;
let data = RecordBatch::try_new(
schema.clone(),
vec![timestamp_column, value_column, host_column],
@@ -787,12 +1008,25 @@ mod test {
range1: Millisecond,
range2: Millisecond,
align: Millisecond,
fill: Fill,
is_float: bool,
expected: String,
) {
let memory_exec = Arc::new(prepare_test_data());
let data_type = if is_float {
DataType::Float64
} else {
DataType::Int64
};
let (need_cast, schema_data_type) = if !is_float && fill == Fill::Linear {
// data_type = DataType::Float64;
(Some(DataType::Float64), DataType::Float64)
} else {
(None, data_type.clone())
};
let memory_exec = Arc::new(prepare_test_data(is_float));
let schema = Arc::new(Schema::new(vec![
Field::new("MIN(value)", DataType::Int64, true),
Field::new("MAX(value)", DataType::Int64, true),
Field::new("MIN(value)", schema_data_type.clone(), true),
Field::new("MAX(value)", schema_data_type, true),
Field::new(TIME_INDEX_COLUMN, TimestampMillisecondType::DATA_TYPE, true),
Field::new("host", DataType::Utf8, true),
]));
@@ -803,19 +1037,23 @@ mod test {
expr: Arc::new(expressions::Min::new(
Arc::new(Column::new("value", 1)),
"MIN(value)",
DataType::Int64,
data_type.clone(),
)),
args: vec![Arc::new(Column::new("value", 1))],
range: range1,
fill: fill.clone(),
need_cast: need_cast.clone(),
},
RangeFnExec {
expr: Arc::new(expressions::Max::new(
Arc::new(Column::new("value", 1)),
"MAX(value)",
DataType::Int64,
data_type,
)),
args: vec![Arc::new(Column::new("value", 1))],
range: range2,
fill,
need_cast,
},
],
align,
@@ -852,85 +1090,225 @@ mod test {
.await
.unwrap();
let result_literal = datatypes::arrow::util::pretty::pretty_format_batches(&result)
let result_literal = arrow::util::pretty::pretty_format_batches(&result)
.unwrap()
.to_string();
assert_eq!(result_literal, expected);
}
#[tokio::test]
async fn range_10s_align_5s() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0 | 0 | 1970-01-01T00:00:00 | host1 |\
\n| 0 | 1 | 1970-01-01T00:00:05 | host1 |\
\n| 1 | 2 | 1970-01-01T00:00:10 | host1 |\
\n| 2 | 3 | 1970-01-01T00:00:15 | host1 |\
\n| 3 | 4 | 1970-01-01T00:00:20 | host1 |\
\n| 4 | 5 | 1970-01-01T00:00:25 | host1 |\
\n| 5 | 6 | 1970-01-01T00:00:30 | host1 |\
\n| 6 | 7 | 1970-01-01T00:00:35 | host1 |\
\n| 7 | 8 | 1970-01-01T00:00:40 | host1 |\
\n| 8 | 8 | 1970-01-01T00:00:45 | host1 |\
\n| 9 | 9 | 1970-01-01T00:00:00 | host2 |\
\n| 9 | 10 | 1970-01-01T00:00:05 | host2 |\
\n| 10 | 11 | 1970-01-01T00:00:10 | host2 |\
\n| 11 | 12 | 1970-01-01T00:00:15 | host2 |\
\n| 12 | 13 | 1970-01-01T00:00:20 | host2 |\
\n| 13 | 14 | 1970-01-01T00:00:25 | host2 |\
\n| 14 | 15 | 1970-01-01T00:00:30 | host2 |\
\n| 15 | 16 | 1970-01-01T00:00:35 | host2 |\
\n| 16 | 17 | 1970-01-01T00:00:40 | host2 |\
\n| 17 | 17 | 1970-01-01T00:00:45 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 10_000, 5_000, expected).await;
}
#[tokio::test]
async fn range_10s_align_1000s() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0 | 0 | 1970-01-01T00:00:00 | host1 |\
\n| 9 | 9 | 1970-01-01T00:00:00 | host2 |\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 10_000, 1_000_000, expected).await;
do_range_select_test(10_000, 10_000, 1_000_000, Fill::Null, true, expected).await;
}
#[tokio::test]
async fn range_10s_5s_align_5s() {
async fn range_fill_null() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 0.0 | | 1970-01-01T00:00:05 | host1 |\
\n| 1.0 | 1.0 | 1970-01-01T00:00:10 | host1 |\
\n| 1.0 | | 1970-01-01T00:00:15 | host1 |\
\n| 2.0 | 2.0 | 1970-01-01T00:00:20 | host1 |\
\n| 2.0 | | 1970-01-01T00:00:25 | host1 |\
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
\n| 3.0 | | 1970-01-01T00:00:05 | host2 |\
\n| 4.0 | 4.0 | 1970-01-01T00:00:10 | host2 |\
\n| 4.0 | | 1970-01-01T00:00:15 | host2 |\
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n| 5.0 | | 1970-01-01T00:00:25 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Null, true, expected).await;
}
#[tokio::test]
async fn range_fill_prev() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 0.0 | 0.0 | 1970-01-01T00:00:05 | host1 |\
\n| 1.0 | 1.0 | 1970-01-01T00:00:10 | host1 |\
\n| 1.0 | 1.0 | 1970-01-01T00:00:15 | host1 |\
\n| 2.0 | 2.0 | 1970-01-01T00:00:20 | host1 |\
\n| 2.0 | 2.0 | 1970-01-01T00:00:25 | host1 |\
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
\n| 3.0 | 3.0 | 1970-01-01T00:00:05 | host2 |\
\n| 4.0 | 4.0 | 1970-01-01T00:00:10 | host2 |\
\n| 4.0 | 4.0 | 1970-01-01T00:00:15 | host2 |\
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n| 5.0 | 5.0 | 1970-01-01T00:00:25 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Prev, true, expected).await;
}
#[tokio::test]
async fn range_fill_linear() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 0.0 | 0.5 | 1970-01-01T00:00:05 | host1 |\
\n| 1.0 | 1.0 | 1970-01-01T00:00:10 | host1 |\
\n| 1.0 | 1.5 | 1970-01-01T00:00:15 | host1 |\
\n| 2.0 | 2.0 | 1970-01-01T00:00:20 | host1 |\
\n| 2.0 | | 1970-01-01T00:00:25 | host1 |\
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
\n| 3.0 | 3.5 | 1970-01-01T00:00:05 | host2 |\
\n| 4.0 | 4.0 | 1970-01-01T00:00:10 | host2 |\
\n| 4.0 | 4.5 | 1970-01-01T00:00:15 | host2 |\
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n| 5.0 | | 1970-01-01T00:00:25 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Linear, true, expected).await;
}
#[tokio::test]
async fn range_fill_integer_null() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0 | 0 | 1970-01-01T00:00:00 | host1 |\
\n| 0 | 1 | 1970-01-01T00:00:05 | host1 |\
\n| 1 | 2 | 1970-01-01T00:00:10 | host1 |\
\n| 2 | 3 | 1970-01-01T00:00:15 | host1 |\
\n| 3 | 4 | 1970-01-01T00:00:20 | host1 |\
\n| 4 | 5 | 1970-01-01T00:00:25 | host1 |\
\n| 5 | 6 | 1970-01-01T00:00:30 | host1 |\
\n| 6 | 7 | 1970-01-01T00:00:35 | host1 |\
\n| 7 | 8 | 1970-01-01T00:00:40 | host1 |\
\n| 8 | | 1970-01-01T00:00:45 | host1 |\
\n| 9 | 9 | 1970-01-01T00:00:00 | host2 |\
\n| 9 | 10 | 1970-01-01T00:00:05 | host2 |\
\n| 10 | 11 | 1970-01-01T00:00:10 | host2 |\
\n| 11 | 12 | 1970-01-01T00:00:15 | host2 |\
\n| 12 | 13 | 1970-01-01T00:00:20 | host2 |\
\n| 13 | 14 | 1970-01-01T00:00:25 | host2 |\
\n| 14 | 15 | 1970-01-01T00:00:30 | host2 |\
\n| 15 | 16 | 1970-01-01T00:00:35 | host2 |\
\n| 16 | 17 | 1970-01-01T00:00:40 | host2 |\
\n| 17 | | 1970-01-01T00:00:45 | host2 |\
\n| 0 | | 1970-01-01T00:00:05 | host1 |\
\n| 1 | 1 | 1970-01-01T00:00:10 | host1 |\
\n| 1 | | 1970-01-01T00:00:15 | host1 |\
\n| 2 | 2 | 1970-01-01T00:00:20 | host1 |\
\n| 2 | | 1970-01-01T00:00:25 | host1 |\
\n| 3 | 3 | 1970-01-01T00:00:00 | host2 |\
\n| 3 | | 1970-01-01T00:00:05 | host2 |\
\n| 4 | 4 | 1970-01-01T00:00:10 | host2 |\
\n| 4 | | 1970-01-01T00:00:15 | host2 |\
\n| 5 | 5 | 1970-01-01T00:00:20 | host2 |\
\n| 5 | | 1970-01-01T00:00:25 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, expected).await;
do_range_select_test(10_000, 5_000, 5_000, Fill::Null, false, expected).await;
}
#[tokio::test]
async fn range_fill_integer_linear() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 0.0 | 0.5 | 1970-01-01T00:00:05 | host1 |\
\n| 1.0 | 1.0 | 1970-01-01T00:00:10 | host1 |\
\n| 1.0 | 1.5 | 1970-01-01T00:00:15 | host1 |\
\n| 2.0 | 2.0 | 1970-01-01T00:00:20 | host1 |\
\n| 2.0 | | 1970-01-01T00:00:25 | host1 |\
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
\n| 3.0 | 3.5 | 1970-01-01T00:00:05 | host2 |\
\n| 4.0 | 4.0 | 1970-01-01T00:00:10 | host2 |\
\n| 4.0 | 4.5 | 1970-01-01T00:00:15 | host2 |\
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n| 5.0 | | 1970-01-01T00:00:25 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Linear, false, expected).await;
}
#[tokio::test]
async fn range_fill_const() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 0.0 | 6.6 | 1970-01-01T00:00:05 | host1 |\
\n| 1.0 | 1.0 | 1970-01-01T00:00:10 | host1 |\
\n| 1.0 | 6.6 | 1970-01-01T00:00:15 | host1 |\
\n| 2.0 | 2.0 | 1970-01-01T00:00:20 | host1 |\
\n| 2.0 | 6.6 | 1970-01-01T00:00:25 | host1 |\
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
\n| 3.0 | 6.6 | 1970-01-01T00:00:05 | host2 |\
\n| 4.0 | 4.0 | 1970-01-01T00:00:10 | host2 |\
\n| 4.0 | 6.6 | 1970-01-01T00:00:15 | host2 |\
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n| 5.0 | 6.6 | 1970-01-01T00:00:25 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(
10_000,
5_000,
5_000,
Fill::Const(ScalarValue::Float64(Some(6.6))),
true,
expected,
)
.await;
}
#[test]
fn fill_test() {
assert!(Fill::try_from_str("Linear", &DataType::UInt8).unwrap() == Fill::Linear);
assert_eq!(
Fill::try_from_str("Linear", &DataType::Boolean)
.unwrap_err()
.to_string(),
"Error during planning: Use FILL LINEAR on Non-numeric DataType Boolean"
);
assert_eq!(
Fill::try_from_str("WHAT", &DataType::UInt8)
.unwrap_err()
.to_string(),
"Error during planning: WHAT is not a valid fill option, fail to convert to a const value. { Arrow error: Cast error: Cannot cast string 'WHAT' to value of UInt8 type }"
);
assert_eq!(
Fill::try_from_str("8.0", &DataType::UInt8)
.unwrap_err()
.to_string(),
"Error during planning: 8.0 is not a valid fill option, fail to convert to a const value. { Arrow error: Cast error: Cannot cast string '8.0' to value of UInt8 type }"
);
assert!(
Fill::try_from_str("8", &DataType::UInt8).unwrap()
== Fill::Const(ScalarValue::UInt8(Some(8)))
);
let mut test1 = vec![
ScalarValue::UInt8(Some(8)),
ScalarValue::UInt8(None),
ScalarValue::UInt8(Some(9)),
];
Fill::Null.apply_fill_strategy(&mut test1).unwrap();
assert_eq!(test1[1], ScalarValue::UInt8(None));
Fill::Prev.apply_fill_strategy(&mut test1).unwrap();
assert_eq!(test1[1], ScalarValue::UInt8(Some(8)));
test1[1] = ScalarValue::UInt8(None);
Fill::Const(ScalarValue::UInt8(Some(10)))
.apply_fill_strategy(&mut test1)
.unwrap();
assert_eq!(test1[1], ScalarValue::UInt8(Some(10)));
test1[1] = ScalarValue::UInt8(None);
assert_eq!(
Fill::Linear
.apply_fill_strategy(&mut test1)
.unwrap_err()
.to_string(),
"Execution error: RangePlan: Apply Fill LINEAR strategy on Non-floating type"
);
let mut test2 = vec![
ScalarValue::Float32(Some(8.0)),
ScalarValue::Float32(None),
ScalarValue::Float32(Some(9.0)),
];
Fill::Linear.apply_fill_strategy(&mut test2).unwrap();
assert_eq!(test2[1], ScalarValue::Float32(Some(8.5)));
}
}

View File

@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::str::FromStr;
use std::collections::BTreeSet;
use std::sync::Arc;
use std::time::Duration;
use arrow_schema::DataType;
use async_recursion::async_recursion;
use catalog::table_source::DfTableSourceProvider;
use datafusion::datasource::DefaultTableSource;
@@ -23,47 +24,62 @@ use datafusion::prelude::Column;
use datafusion::scalar::ScalarValue;
use datafusion_common::tree_node::{TreeNode, TreeNodeRewriter, VisitRecursion};
use datafusion_common::{DFSchema, DataFusionError, Result as DFResult};
use datafusion_expr::expr::{AggregateFunction, AggregateUDF, ScalarUDF};
use datafusion_expr::expr::ScalarUDF;
use datafusion_expr::{
AggregateFunction as AggregateFn, Expr, Extension, LogicalPlan, LogicalPlanBuilder, Projection,
Aggregate, Expr, ExprSchemable, Extension, LogicalPlan, LogicalPlanBuilder, Projection,
};
use datafusion_sql::planner::ContextProvider;
use datatypes::prelude::ConcreteDataType;
use promql_parser::util::parse_duration;
use snafu::{OptionExt, ResultExt};
use table::table::adapter::DfTableProviderAdapter;
use super::plan::Fill;
use crate::error::{
CatalogSnafu, DataFusionSnafu, Result, TimeIndexNotFoundSnafu, UnknownTableSnafu,
CatalogSnafu, DataFusionSnafu, RangeQuerySnafu, Result, TimeIndexNotFoundSnafu,
UnknownTableSnafu,
};
use crate::range_select::plan::{RangeFn, RangeSelect};
use crate::DfContextProviderAdapter;
/// `RangeExprRewriter` recursively walks the given `Expr`, finds all `range_fn` scalar UDFs it contains,
/// collects the information required by the RangeSelect query,
/// and finally replaces each `range_fn` scalar UDF with an ordinary column field.
pub struct RangeExprRewriter<'a> {
input_plan: &'a Arc<LogicalPlan>,
align: Duration,
by: Vec<Expr>,
range_fn: Vec<RangeFn>,
context_provider: &'a DfContextProviderAdapter,
/// Use a `BTreeSet` so that in a case like `avg(a) RANGE '5m' + avg(a) RANGE '5m'`, the duplicate range expr `avg(a) RANGE '5m'` is not calculated twice
range_fn: BTreeSet<RangeFn>,
sub_aggr: &'a Aggregate,
}
#[inline]
fn dispose_parse_error(expr: Option<&Expr>) -> DataFusionError {
DataFusionError::Plan(
expr.map(|x| {
format!(
"Illegal argument `{}` in range select query",
x.display_name().unwrap_or_default()
)
})
.unwrap_or("Missing argument in range select query".into()),
)
}
impl<'a> RangeExprRewriter<'a> {
pub fn gen_range_expr(&self, func_name: &str, args: Vec<Expr>) -> DFResult<Expr> {
match AggregateFn::from_str(func_name) {
Ok(agg_fn) => Ok(Expr::AggregateFunction(AggregateFunction::new(
agg_fn, args, false, None, None,
))),
Err(_) => match self.context_provider.get_aggregate_meta(func_name) {
Some(agg_udf) => Ok(Expr::AggregateUDF(AggregateUDF::new(
agg_udf, args, None, None,
))),
None => Err(DataFusionError::Plan(format!(
"{} is not a Aggregate function or a Aggregate UDF",
func_name
))),
},
pub fn get_range_expr(&self, args: &[Expr], i: usize) -> DFResult<Expr> {
match args.get(i) {
Some(Expr::Column(column)) => {
let index = self.sub_aggr.schema.index_of_column(column)?;
let len = self.sub_aggr.group_expr.len();
self.sub_aggr
.aggr_expr
.get(index - len)
.cloned()
.ok_or(DataFusionError::Plan(
"Range expr not found in underlying Aggregate Plan".into(),
))
}
other => Err(dispose_parse_error(other)),
}
}
}
@@ -71,9 +87,7 @@ impl<'a> RangeExprRewriter<'a> {
fn parse_str_expr(args: &[Expr], i: usize) -> DFResult<&str> {
match args.get(i) {
Some(Expr::Literal(ScalarValue::Utf8(Some(str)))) => Ok(str.as_str()),
_ => Err(DataFusionError::Plan(
"Illegal argument in range select query".into(),
)),
other => Err(dispose_parse_error(other)),
}
}
@@ -88,10 +102,8 @@ fn parse_expr_list(args: &[Expr], start: usize, len: usize) -> DFResult<Vec<Expr
| Expr::ScalarFunction(_)
| Expr::ScalarUDF(_),
) => args[i].clone(),
_ => {
return Err(DataFusionError::Plan(
"Illegal expr argument in range select query".into(),
))
other => {
return Err(dispose_parse_error(*other));
}
});
}
@@ -104,23 +116,22 @@ impl<'a> TreeNodeRewriter for RangeExprRewriter<'a> {
fn mutate(&mut self, node: Expr) -> DFResult<Expr> {
if let Expr::ScalarUDF(func) = &node {
if func.fun.name == "range_fn" {
// `range_fn(func_name, argc, [argv], range, fill, byc, [byv], align)`
// `argsv` and `byv` are variadic arguments, argc/byc indicate the length of arguments
let func_name = parse_str_expr(&func.args, 0)?;
let argc = str::parse::<usize>(parse_str_expr(&func.args, 1)?)
// `range_fn(func, range, fill, byc, [byv], align)`
// `[byv]` is a variadic argument list; `byc` indicates its length
let range_expr = self.get_range_expr(&func.args, 0)?;
let range_str = parse_str_expr(&func.args, 1)?;
let byc = str::parse::<usize>(parse_str_expr(&func.args, 3)?)
.map_err(|e| DataFusionError::Plan(e.to_string()))?;
let byc = str::parse::<usize>(parse_str_expr(&func.args, argc + 4)?)
.map_err(|e| DataFusionError::Plan(e.to_string()))?;
let mut range_fn = RangeFn {
expr: Expr::Wildcard,
range: parse_duration(parse_str_expr(&func.args, argc + 2)?)
.map_err(DataFusionError::Plan)?,
fill: parse_str_expr(&func.args, argc + 3)?.to_string(),
};
let args = parse_expr_list(&func.args, 2, argc)?;
let by = parse_expr_list(&func.args, argc + 5, byc)?;
let align = parse_duration(parse_str_expr(&func.args, argc + byc + 5)?)
let by = parse_expr_list(&func.args, 4, byc)?;
let align = parse_duration(parse_str_expr(&func.args, byc + 4)?)
.map_err(DataFusionError::Plan)?;
let mut data_type = range_expr.get_type(self.input_plan.schema())?;
let mut need_cast = false;
let fill = Fill::try_from_str(parse_str_expr(&func.args, 2)?, &data_type)?;
if matches!(fill, Fill::Linear) && data_type.is_integer() {
data_type = DataType::Float64;
need_cast = true;
}
if !self.by.is_empty() && self.by != by {
return Err(DataFusionError::Plan(
"Inconsistent by given in Range Function Rewrite".into(),
@@ -135,9 +146,21 @@ impl<'a> TreeNodeRewriter for RangeExprRewriter<'a> {
} else {
self.align = align;
}
range_fn.expr = self.gen_range_expr(func_name, args)?;
let alias = Expr::Column(Column::from_name(range_fn.expr.display_name()?));
self.range_fn.push(range_fn);
let range_fn = RangeFn {
name: format!(
"{} RANGE {} FILL {}",
range_expr.display_name()?,
range_str,
fill
),
data_type,
expr: range_expr,
range: parse_duration(range_str).map_err(DataFusionError::Plan)?,
fill,
need_cast,
};
let alias = Expr::Column(Column::from_name(range_fn.name.clone()));
self.range_fn.insert(range_fn);
return Ok(alias);
}
}
@@ -146,25 +169,18 @@ impl<'a> TreeNodeRewriter for RangeExprRewriter<'a> {
}
/// In order to implement RangeSelect query like `avg(field_0) RANGE '5m' FILL NULL`,
/// All RangeSelect query items are converted into udf scalar function in sql parse stage, with format like `range_fn('avg', .....)`.
/// All RangeSelect query items are converted into udf scalar function in sql parse stage, with format like `range_fn(avg(field_0), .....)`.
/// `range_fn` contains all the parameters we need to execute RangeSelect.
/// To execute a range select query correctly, we need to modify the query plan generated by DataFusion.
/// We need to recursively traverse the entire LogicalPlan, find all `range_fn` scalar UDFs contained in the projection plan,
/// collect the info we need to generate the RangeSelect LogicalPlan, and rewrite the original LogicalPlan.
pub struct RangePlanRewriter {
table_provider: DfTableSourceProvider,
context_provider: DfContextProviderAdapter,
}
impl RangePlanRewriter {
pub fn new(
table_provider: DfTableSourceProvider,
context_provider: DfContextProviderAdapter,
) -> Self {
Self {
table_provider,
context_provider,
}
pub fn new(table_provider: DfTableSourceProvider) -> Self {
Self { table_provider }
}
pub async fn rewrite(&mut self, plan: LogicalPlan) -> Result<LogicalPlan> {
@@ -185,17 +201,28 @@ impl RangePlanRewriter {
LogicalPlan::Projection(Projection { expr, input, .. })
if have_range_in_exprs(expr) =>
{
let input = if let Some(new_input) = new_inputs[0].take() {
Arc::new(new_input)
let (aggr_plan, input) = if let LogicalPlan::Aggregate(aggr) = input.as_ref() {
// An expr like `rate(max(a) RANGE '6m') RANGE '6m'` has legal syntax but illegal semantics.
if have_range_in_exprs(&aggr.aggr_expr) {
return RangeQuerySnafu {
msg: "Nest Range Query is not allowed",
}
.fail();
}
(aggr, aggr.input.clone())
} else {
input.clone()
return RangeQuerySnafu {
msg: "Window functions is not allowed in Range Query",
}
.fail();
};
let (time_index, default_by) = self.get_index_by(input.schema().clone()).await?;
let (time_index, default_by) = self.get_index_by(input.schema()).await?;
let mut range_rewriter = RangeExprRewriter {
input_plan: &input,
align: Duration::default(),
by: vec![],
range_fn: vec![],
context_provider: &self.context_provider,
range_fn: BTreeSet::new(),
sub_aggr: aggr_plan,
};
let new_expr = expr
.iter()
@@ -207,7 +234,7 @@ impl RangePlanRewriter {
}
let range_select = RangeSelect::try_new(
input.clone(),
range_rewriter.range_fn,
range_rewriter.range_fn.into_iter().collect(),
range_rewriter.align,
time_index,
range_rewriter.by,
@@ -252,7 +279,7 @@ impl RangePlanRewriter {
/// return `(time_index, [row_columns])` to the rewriter.
/// If the user does not explicitly use the `by` keyword to indicate time series,
/// `[row_columns]` will be used as the default time series.
async fn get_index_by(&mut self, schema: Arc<DFSchema>) -> Result<(Expr, Vec<Expr>)> {
async fn get_index_by(&mut self, schema: &Arc<DFSchema>) -> Result<(Expr, Vec<Expr>)> {
let mut time_index_expr = Expr::Wildcard;
let mut default_by = vec![];
for field in schema.fields() {
@@ -303,28 +330,27 @@ impl RangePlanRewriter {
}
}
fn have_range_in_exprs(exprs: &Vec<Expr>) -> bool {
let mut have = false;
for expr in exprs {
fn have_range_in_exprs(exprs: &[Expr]) -> bool {
exprs.iter().any(|expr| {
let mut find_range = false;
let _ = expr.apply(&mut |expr| {
if let Expr::ScalarUDF(ScalarUDF { fun, .. }) = expr {
if fun.name == "range_fn" {
have = true;
find_range = true;
return Ok(VisitRecursion::Stop);
}
}
Ok(VisitRecursion::Continue)
});
if have {
break;
}
}
have
find_range
})
}
#[cfg(test)]
mod test {
use std::error::Error;
use catalog::memory::MemoryCatalogManager;
use catalog::RegisterTableRequest;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
@@ -391,14 +417,14 @@ mod test {
QueryEngineFactory::new(catalog_list, None, None, false).query_engine()
}
async fn query_plan_compare(sql: &str, expected: String) {
async fn do_query(sql: &str) -> Result<crate::plan::LogicalPlan> {
let stmt = QueryLanguageParser::parse_sql(sql).unwrap();
let engine = create_test_engine().await;
let GreptimeLogicalPlan::DfPlan(plan) = engine
.planner()
.plan(stmt, QueryContext::arc())
.await
.unwrap();
engine.planner().plan(stmt, QueryContext::arc()).await
}
async fn query_plan_compare(sql: &str, expected: String) {
let GreptimeLogicalPlan::DfPlan(plan) = do_query(sql).await.unwrap();
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
@@ -406,7 +432,7 @@ mod test {
async fn range_no_project() {
let query = r#"SELECT timestamp, tag_0, tag_1, avg(field_0 + field_1) RANGE '5m' FROM test ALIGN '1h' by (tag_0,tag_1);"#;
let expected = String::from(
"RangeSelect: range_exprs=[RangeFn { expr:AVG(test.field_0 + test.field_1) range:300s fill: }], align=3600s time_index=timestamp [timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, AVG(test.field_0 + test.field_1):Float64;N]\
"RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600s time_index=timestamp [timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
@@ -414,11 +440,10 @@ mod test {
#[tokio::test]
async fn range_expr_calculation() {
let query =
r#"SELECT avg(field_0 + field_1)/4 RANGE '5m' FROM test ALIGN '1h' by (tag_0,tag_1);"#;
let query = r#"SELECT (avg(field_0 + field_1)/4) RANGE '5m' FROM test ALIGN '1h' by (tag_0,tag_1);"#;
let expected = String::from(
"Projection: AVG(test.field_0 + test.field_1) / Int64(4) [AVG(test.field_0 + test.field_1) / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[RangeFn { expr:AVG(test.field_0 + test.field_1) range:300s fill: }], align=3600s time_index=timestamp [AVG(test.field_0 + test.field_1):Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
"Projection: AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL / Int64(4) [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600s time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
@@ -427,10 +452,10 @@ mod test {
#[tokio::test]
async fn range_multi_args() {
let query =
r#"SELECT covar(field_0 + field_1, field_1)/4 RANGE '5m' FROM test ALIGN '1h';"#;
r#"SELECT (covar(field_0 + field_1, field_1)/4) RANGE '5m' FROM test ALIGN '1h';"#;
let expected = String::from(
"Projection: COVARIANCE(test.field_0 + test.field_1,test.field_1) / Int64(4) [COVARIANCE(test.field_0 + test.field_1,test.field_1) / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[RangeFn { expr:COVARIANCE(test.field_0 + test.field_1,test.field_1) range:300s fill: }], align=3600s time_index=timestamp [COVARIANCE(test.field_0 + test.field_1,test.field_1):Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8]\
"Projection: COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL / Int64(4) [COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL], align=3600s time_index=timestamp [COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
@@ -438,10 +463,10 @@ mod test {
#[tokio::test]
async fn range_calculation() {
let query = r#"SELECT (avg(field_0)+sum(field_1))/4 RANGE '5m' FROM test ALIGN '1h' by (tag_0,tag_1) FILL NULL;"#;
let query = r#"SELECT ((avg(field_0)+sum(field_1))/4) RANGE '5m' FROM test ALIGN '1h' by (tag_0,tag_1) FILL NULL;"#;
let expected = String::from(
"Projection: (AVG(test.field_0) + SUM(test.field_1)) / Int64(4) [AVG(test.field_0) + SUM(test.field_1) / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[RangeFn { expr:AVG(test.field_0) range:300s fill:NULL }, RangeFn { expr:SUM(test.field_1) range:300s fill:NULL }], align=3600s time_index=timestamp [AVG(test.field_0):Float64;N, SUM(test.field_1):Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
"Projection: (AVG(test.field_0) RANGE 5m FILL NULL + SUM(test.field_1) RANGE 5m FILL NULL) / Int64(4) [AVG(test.field_0) RANGE 5m FILL NULL + SUM(test.field_1) RANGE 5m FILL NULL / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[AVG(test.field_0) RANGE 5m FILL NULL, SUM(test.field_1) RANGE 5m FILL NULL], align=3600s time_index=timestamp [AVG(test.field_0) RANGE 5m FILL NULL:Float64;N, SUM(test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
@@ -449,12 +474,12 @@ mod test {
#[tokio::test]
async fn range_as_sub_query() {
let query = r#"SELECT foo + 1 from (SELECT (avg(field_0)+sum(field_1))/4 RANGE '5m' as foo FROM test ALIGN '1h' by (tag_0,tag_1) FILL NULL) where foo > 1;"#;
let query = r#"SELECT foo + 1 from (SELECT ((avg(field_0)+sum(field_1))/4) RANGE '5m' as foo FROM test ALIGN '1h' by (tag_0,tag_1) FILL NULL) where foo > 1;"#;
let expected = String::from(
"Projection: foo + Int64(1) [foo + Int64(1):Float64;N]\
\n Filter: foo > Int64(1) [foo:Float64;N]\
\n Projection: (AVG(test.field_0) + SUM(test.field_1)) / Int64(4) AS foo [foo:Float64;N]\
\n RangeSelect: range_exprs=[RangeFn { expr:AVG(test.field_0) range:300s fill:NULL }, RangeFn { expr:SUM(test.field_1) range:300s fill:NULL }], align=3600s time_index=timestamp [AVG(test.field_0):Float64;N, SUM(test.field_1):Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n Projection: (AVG(test.field_0) RANGE 5m FILL NULL + SUM(test.field_1) RANGE 5m FILL NULL) / Int64(4) AS foo [foo:Float64;N]\
\n RangeSelect: range_exprs=[AVG(test.field_0) RANGE 5m FILL NULL, SUM(test.field_1) RANGE 5m FILL NULL], align=3600s time_index=timestamp [AVG(test.field_0) RANGE 5m FILL NULL:Float64;N, SUM(test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
@@ -462,14 +487,109 @@ mod test {
#[tokio::test]
async fn range_from_nest_query() {
let query = r#"SELECT (avg(a)+sum(b))/4 RANGE '5m' FROM (SELECT field_0 as a, field_1 as b, tag_0 as c, tag_1 as d, timestamp from test where field_0 > 1.0) ALIGN '1h' by (c, d) FILL NULL;"#;
let query = r#"SELECT ((avg(a)+sum(b))/4) RANGE '5m' FROM (SELECT field_0 as a, field_1 as b, tag_0 as c, tag_1 as d, timestamp from test where field_0 > 1.0) ALIGN '1h' by (c, d) FILL NULL;"#;
let expected = String::from(
"Projection: (AVG(a) + SUM(b)) / Int64(4) [AVG(a) + SUM(b) / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[RangeFn { expr:AVG(a) range:300s fill:NULL }, RangeFn { expr:SUM(b) range:300s fill:NULL }], align=3600s time_index=timestamp [AVG(a):Float64;N, SUM(b):Float64;N, timestamp:Timestamp(Millisecond, None), c:Utf8, d:Utf8]\
"Projection: (AVG(a) RANGE 5m FILL NULL + SUM(b) RANGE 5m FILL NULL) / Int64(4) [AVG(a) RANGE 5m FILL NULL + SUM(b) RANGE 5m FILL NULL / Int64(4):Float64;N]\
\n RangeSelect: range_exprs=[AVG(a) RANGE 5m FILL NULL, SUM(b) RANGE 5m FILL NULL], align=3600s time_index=timestamp [AVG(a) RANGE 5m FILL NULL:Float64;N, SUM(b) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), c:Utf8, d:Utf8]\
\n Projection: test.field_0 AS a, test.field_1 AS b, test.tag_0 AS c, test.tag_1 AS d, test.timestamp [a:Float64;N, b:Float64;N, c:Utf8, d:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Filter: test.field_0 > Float64(1) [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
}
#[tokio::test]
async fn range_in_expr() {
let query = r#"SELECT sin(avg(field_0 + field_1) RANGE '5m' + 1) FROM test ALIGN '1h' by (tag_0,tag_1);"#;
let expected = String::from(
"Projection: sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1)) [sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1)):Float64;N]\
\n RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600s time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
}
#[tokio::test]
async fn duplicate_range_expr() {
let query = r#"SELECT avg(field_0) RANGE '5m' FILL 6.0 + avg(field_0) RANGE '5m' FILL 6.0 FROM test ALIGN '1h' by (tag_0,tag_1);"#;
let expected = String::from(
"Projection: AVG(test.field_0) RANGE 5m FILL 6 + AVG(test.field_0) RANGE 5m FILL 6 [AVG(test.field_0) RANGE 5m FILL 6 + AVG(test.field_0) RANGE 5m FILL 6:Float64]\
\n RangeSelect: range_exprs=[AVG(test.field_0) RANGE 5m FILL 6], align=3600s time_index=timestamp [AVG(test.field_0) RANGE 5m FILL 6:Float64, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
}
#[tokio::test]
async fn deep_nest_range_expr() {
let query = r#"SELECT round(sin(avg(field_0 + field_1) RANGE '5m' + 1)) FROM test ALIGN '1h' by (tag_0,tag_1);"#;
let expected = String::from(
"Projection: round(sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1))) [round(sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1))):Float64;N]\
\n RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600s time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
}
#[tokio::test]
async fn complex_range_expr() {
let query = r#"SELECT gcd(CAST(max(field_0 + 1) Range '5m' FILL NULL AS Int64), CAST(tag_0 AS Int64)) + round(max(field_2+1) Range '6m' FILL NULL + 1) + max(field_2+3) Range '10m' FILL NULL * CAST(tag_1 AS Float64) + 1 FROM test ALIGN '1h' by (tag_0, tag_1);"#;
let expected = String::from(
"Projection: gcd(CAST(MAX(test.field_0 + Int64(1)) RANGE 5m FILL NULL AS Int64), CAST(test.tag_0 AS Int64)) + round(MAX(test.field_2 + Int64(1)) RANGE 6m FILL NULL + Int64(1)) + MAX(test.field_2 + Int64(3)) RANGE 10m FILL NULL * CAST(test.tag_1 AS Float64) + Int64(1) [gcd(MAX(test.field_0 + Int64(1)) RANGE 5m FILL NULL,test.tag_0) + round(MAX(test.field_2 + Int64(1)) RANGE 6m FILL NULL + Int64(1)) + MAX(test.field_2 + Int64(3)) RANGE 10m FILL NULL * test.tag_1 + Int64(1):Float64;N]\
\n RangeSelect: range_exprs=[MAX(test.field_0 + Int64(1)) RANGE 5m FILL NULL, MAX(test.field_2 + Int64(1)) RANGE 6m FILL NULL, MAX(test.field_2 + Int64(3)) RANGE 10m FILL NULL], align=3600s time_index=timestamp [MAX(test.field_0 + Int64(1)) RANGE 5m FILL NULL:Float64;N, MAX(test.field_2 + Int64(1)) RANGE 6m FILL NULL:Float64;N, MAX(test.field_2 + Int64(3)) RANGE 10m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
}
#[tokio::test]
async fn range_linear_on_integer() {
let query = r#"SELECT min(CAST(field_0 AS Int64) + CAST(field_1 AS Int64)) RANGE '5m' FILL LINEAR FROM test ALIGN '1h' by (tag_0,tag_1);"#;
let expected = String::from(
"RangeSelect: range_exprs=[MIN(test.field_0 + test.field_1) RANGE 5m FILL LINEAR], align=3600s time_index=timestamp [MIN(test.field_0 + test.field_1) RANGE 5m FILL LINEAR:Float64;N]\
\n TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
);
query_plan_compare(query, expected).await;
}
#[tokio::test]
async fn range_nest_range_err() {
let query = r#"SELECT sum(avg(field_0 + field_1) RANGE '5m' + 1) RANGE '5m' + 1 FROM test ALIGN '1h' by (tag_0,tag_1);"#;
assert_eq!(
do_query(query).await.unwrap_err().to_string(),
"Range Query: Nest Range Query is not allowed"
)
}
#[tokio::test]
/// Start directly from the rewritten SQL and check whether the error reported by the range expression rewriting is as expected.
/// The correct call would be `range_fn(avg(field_0), '5m', 'NULL', '0', '1h')`.
async fn range_argument_err_1() {
let query = r#"SELECT range_fn('5m', avg(field_0), 'NULL', '1', tag_0, '1h') FROM test group by tag_0;"#;
let error = do_query(query)
.await
.unwrap_err()
.source()
.unwrap()
.to_string();
assert_eq!(
error,
"Error during planning: Illegal argument `Utf8(\"5m\")` in range select query"
)
}
#[tokio::test]
async fn range_argument_err_2() {
let query = r#"SELECT range_fn(avg(field_0), 5, 'NULL', '1', tag_0, '1h') FROM test group by tag_0;"#;
let error = do_query(query)
.await
.unwrap_err()
.source()
.unwrap()
.to_string();
assert_eq!(
error,
"Error during planning: Illegal argument `Int64(5)` in range select query"
)
}
}

View File

@@ -392,7 +392,6 @@ impl ErrorExt for Error {
Internal { .. }
| InternalIo { .. }
| TokioIo { .. }
| CollectRecordbatch { .. }
| StartHttp { .. }
| StartGrpc { .. }
| AlreadyStarted { .. }
@@ -403,6 +402,8 @@ impl ErrorExt for Error {
| GrpcReflectionService { .. }
| BuildHttpResponse { .. } => StatusCode::Internal,
CollectRecordbatch { .. } => StatusCode::EngineExecuteQuery,
InsertScript { source, .. }
| ExecuteScript { source, .. }
| ExecuteQuery { source, .. }

View File

@@ -660,6 +660,7 @@ impl HttpServer {
fn route_otlp<S>(&self, otlp_handler: OpenTelemetryProtocolHandlerRef) -> Router<S> {
Router::new()
.route("/v1/metrics", routing::post(otlp::metrics))
.route("/v1/traces", routing::post(otlp::traces))
.with_state(otlp_handler)
}

View File

@@ -84,17 +84,19 @@ pub async fn put(
let summary = params.contains_key("summary");
let details = params.contains_key("details");
let data_points = parse_data_points(body).await?;
let data_point_requests = parse_data_points(body).await?;
let data_points = data_point_requests
.iter()
.map(|point| point.clone().into())
.collect::<Vec<_>>();
let response = if !summary && !details {
for data_point in data_points.into_iter() {
if let Err(e) = opentsdb_handler.exec(&data_point.into(), ctx.clone()).await {
// Not debugging purpose, failed fast.
return error::InternalSnafu {
err_msg: e.to_string(),
}
.fail();
if let Err(e) = opentsdb_handler.exec(data_points, ctx.clone()).await {
// Not for debugging purposes; fail fast.
return error::InternalSnafu {
err_msg: e.to_string(),
}
.fail();
}
(HttpStatusCode::NO_CONTENT, Json(OpentsdbPutResponse::Empty))
} else {
@@ -108,15 +110,11 @@ pub async fn put(
},
};
for data_point in data_points.into_iter() {
let result = opentsdb_handler
.exec(&data_point.clone().into(), ctx.clone())
.await;
for (data_point, request) in data_points.into_iter().zip(data_point_requests) {
let result = opentsdb_handler.exec(vec![data_point], ctx.clone()).await;
match result {
Ok(()) => response.on_success(),
Err(e) => {
response.on_failed(data_point, e);
}
Ok(affected_rows) => response.on_success(affected_rows),
Err(e) => response.on_failed(request, e),
}
}
(
@@ -151,8 +149,8 @@ pub struct OpentsdbDebuggingResponse {
}
impl OpentsdbDebuggingResponse {
fn on_success(&mut self) {
self.success += 1;
fn on_success(&mut self, affected_rows: usize) {
self.success += affected_rows as i32;
}
fn on_failed(&mut self, datapoint: DataPointRequest, error: impl ErrorExt) {

View File

@@ -21,6 +21,9 @@ use hyper::Body;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use prost::Message;
use session::context::QueryContextRef;
use snafu::prelude::*;
@@ -33,16 +36,19 @@ pub async fn metrics(
State(handler): State<OpenTelemetryProtocolHandlerRef>,
Extension(query_ctx): Extension<QueryContextRef>,
RawBody(body): RawBody,
) -> Result<OtlpResponse> {
) -> Result<OtlpMetricsResponse> {
let _timer = timer!(
crate::metrics::METRIC_HTTP_OPENTELEMETRY_ELAPSED,
crate::metrics::METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED,
&[(crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string())]
);
let request = parse_body(body).await?;
handler.metrics(request, query_ctx).await.map(OtlpResponse)
let request = parse_metrics_body(body).await?;
handler
.metrics(request, query_ctx)
.await
.map(OtlpMetricsResponse)
}
async fn parse_body(body: Body) -> Result<ExportMetricsServiceRequest> {
async fn parse_metrics_body(body: Body) -> Result<ExportMetricsServiceRequest> {
hyper::body::to_bytes(body)
.await
.context(error::HyperSnafu)
@@ -51,9 +57,47 @@ async fn parse_body(body: Body) -> Result<ExportMetricsServiceRequest> {
})
}
pub struct OtlpResponse(ExportMetricsServiceResponse);
pub struct OtlpMetricsResponse(ExportMetricsServiceResponse);
impl IntoResponse for OtlpResponse {
impl IntoResponse for OtlpMetricsResponse {
fn into_response(self) -> axum::response::Response {
(
[(header::CONTENT_TYPE, "application/x-protobuf")],
self.0.encode_to_vec(),
)
.into_response()
}
}
#[axum_macros::debug_handler]
pub async fn traces(
State(handler): State<OpenTelemetryProtocolHandlerRef>,
Extension(query_ctx): Extension<QueryContextRef>,
RawBody(body): RawBody,
) -> Result<OtlpTracesResponse> {
let _timer = timer!(
crate::metrics::METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED,
&[(crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string())]
);
let request = parse_traces_body(body).await?;
handler
.traces(request, query_ctx)
.await
.map(OtlpTracesResponse)
}
async fn parse_traces_body(body: Body) -> Result<ExportTraceServiceRequest> {
hyper::body::to_bytes(body)
.await
.context(error::HyperSnafu)
.and_then(|buf| {
ExportTraceServiceRequest::decode(&buf[..]).context(error::DecodeOtlpRequestSnafu)
})
}
pub struct OtlpTracesResponse(ExportTraceServiceResponse);
impl IntoResponse for OtlpTracesResponse {
fn into_response(self) -> axum::response::Response {
(
[(header::CONTENT_TYPE, "application/x-protobuf")],

View File

@@ -37,7 +37,10 @@ pub(crate) const METRIC_HTTP_INFLUXDB_WRITE_ELAPSED: &str = "servers.http_influx
pub(crate) const METRIC_HTTP_PROM_STORE_WRITE_ELAPSED: &str =
"servers.http_prometheus_write_elapsed";
pub(crate) const METRIC_HTTP_PROM_STORE_READ_ELAPSED: &str = "servers.http_prometheus_read_elapsed";
pub(crate) const METRIC_HTTP_OPENTELEMETRY_ELAPSED: &str = "servers.http_otlp_elapsed";
pub(crate) const METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED: &str =
"servers.http_otlp_metrics_elapsed";
pub(crate) const METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED: &str =
"servers.http_otlp_traces_elapsed";
pub(crate) const METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED: &str =
"servers.opentsdb_line_write_elapsed";
pub(crate) const METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED: &str =

View File

@@ -14,7 +14,7 @@
use std::ops::Deref;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::ext::ErrorExt;
use common_query::Output;
use common_recordbatch::{RecordBatch, SendableRecordBatchStream};
use datatypes::prelude::{ConcreteDataType, Value};
@@ -28,7 +28,7 @@ use session::context::QueryContextRef;
use snafu::prelude::*;
use tokio::io::AsyncWrite;
use crate::error::{self, Error, OtherSnafu, Result};
use crate::error::{self, Error, Result};
use crate::metrics::*;
/// Try to write multiple output to the writer if possible.
@@ -148,7 +148,11 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> {
.await?
}
Err(e) => {
return Err(e).map_err(BoxedError::new).context(OtherSnafu);
let err = e.to_string();
row_writer
.finish_error(ErrorKind::ER_INTERNAL_ERROR, &err.as_bytes())
.await?;
return Ok(());
}
}
}

View File

@@ -20,16 +20,20 @@ use std::future::Future;
use std::net::SocketAddr;
use std::sync::Arc;
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use common_runtime::Runtime;
use common_telemetry::logging::error;
use futures::StreamExt;
use tokio::sync::broadcast;
use self::codec::DataPoint;
use crate::error::Result;
use crate::opentsdb::connection::Connection;
use crate::opentsdb::handler::Handler;
use crate::prom_store::{FIELD_COLUMN_NAME, TIMESTAMP_COLUMN_NAME};
use crate::query_handler::OpentsdbProtocolHandlerRef;
use crate::row_writer::{self, MultiTableData};
use crate::server::{AbortableStream, BaseTcpServer, Server};
use crate::shutdown::Shutdown;
@@ -126,3 +130,38 @@ impl Server for OpentsdbServer {
OPENTSDB_SERVER
}
}
pub fn data_point_to_grpc_row_insert_requests(
data_points: Vec<DataPoint>,
) -> Result<(RowInsertRequests, usize)> {
let mut multi_table_data = MultiTableData::new();
for mut data_point in data_points {
let tags: Vec<(String, String)> = std::mem::take(data_point.tags_mut());
let table_name = data_point.metric();
let value = data_point.value();
let timestamp = data_point.ts_millis();
// length of tags + 2 extra columns for greptime_timestamp and the value
let num_columns = tags.len() + 2;
let table_data = multi_table_data.get_or_default_table_data(table_name, num_columns, 1);
let mut one_row = table_data.alloc_one_row();
// tags
row_writer::write_tags(table_data, tags.into_iter(), &mut one_row)?;
// value
row_writer::write_f64(table_data, FIELD_COLUMN_NAME, value, &mut one_row)?;
// timestamp
row_writer::write_ts_millis(
table_data,
TIMESTAMP_COLUMN_NAME,
Some(timestamp),
&mut one_row,
)?;
table_data.add_row(one_row);
}
Ok(multi_table_data.into_row_insert_requests())
}
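
A rough usage sketch for the conversion above, assuming `DataPoint::try_create` accepts an OpenTSDB telnet-style `put` line (the exact line format is not shown in this excerpt); the tags become tag columns while the value and timestamp land in the field and timestamp columns:

// Hypothetical input line.
let data_point = DataPoint::try_create("put sys.cpu.user 1479496100 42.5 host=web01")?;
let (requests, row_count) = data_point_to_grpc_row_insert_requests(vec![data_point])?;
assert_eq!(row_count, 1);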

View File

@@ -19,7 +19,7 @@ use crate::error::{self, Result};
pub const OPENTSDB_TIMESTAMP_COLUMN_NAME: &str = "greptime_timestamp";
pub const OPENTSDB_FIELD_COLUMN_NAME: &str = "greptime_value";
#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct DataPoint {
metric: String,
ts_millis: i64,
@@ -115,6 +115,10 @@ impl DataPoint {
&self.tags
}
pub fn tags_mut(&mut self) -> &mut Vec<(String, String)> {
&mut self.tags
}
pub fn ts_millis(&self) -> i64 {
self.ts_millis
}

View File

@@ -94,7 +94,7 @@ impl<S: AsyncWrite + AsyncRead + Unpin> Handler<S> {
match DataPoint::try_create(&line) {
Ok(data_point) => {
let _timer = timer!(crate::metrics::METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED);
let result = self.query_handler.exec(&data_point, ctx.clone()).await;
let result = self.query_handler.exec(vec![data_point], ctx.clone()).await;
if let Err(e) = result {
self.connection.write_line(e.output_msg()).await?;
}
@@ -128,8 +128,8 @@ mod tests {
#[async_trait]
impl OpentsdbProtocolHandler for DummyQueryHandler {
async fn exec(&self, data_point: &DataPoint, _ctx: QueryContextRef) -> Result<()> {
let metric = data_point.metric();
async fn exec(&self, data_points: Vec<DataPoint>, _ctx: QueryContextRef) -> Result<usize> {
let metric = data_points.first().unwrap().metric();
if metric == "should_failed" {
return error::InternalSnafu {
err_msg: "expected",
@@ -137,7 +137,7 @@ mod tests {
.fail();
}
self.tx.send(metric.to_string()).await.unwrap();
Ok(())
Ok(data_points.len())
}
}
@@ -169,7 +169,7 @@ mod tests {
.await
.unwrap();
let resp = client.read_line().await.unwrap();
assert_eq!(resp, Some("Internal error: expected".to_string()));
assert_eq!(resp, Some("Internal error: 1003".to_string()));
client.write_line("get".to_string()).await.unwrap();
let resp = client.read_line().await.unwrap();

View File

@@ -12,649 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::{RowInsertRequests, Value};
use common_grpc::writer::Precision;
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
use opentelemetry_proto::tonic::common::v1::{any_value, KeyValue};
use opentelemetry_proto::tonic::metrics::v1::{metric, number_data_point, *};
use crate::error::Result;
use crate::row_writer::{self, MultiTableData, TableData};
pub mod metrics;
pub mod plugin;
pub mod trace;
const GREPTIME_TIMESTAMP: &str = "greptime_timestamp";
const GREPTIME_VALUE: &str = "greptime_value";
const GREPTIME_COUNT: &str = "greptime_count";
/// the default column count for table writer
const APPROXIMATE_COLUMN_COUNT: usize = 8;
/// Normalize otlp instrumentation, metric and attribute names
///
/// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/api.md#instrument-name-syntax>
/// - since the names are case-insensitive, we transform them to lowercase for
/// better SQL usability
/// - replace `.` and `-` with `_`
fn normalize_otlp_name(name: &str) -> String {
name.to_lowercase().replace(|c| c == '.' || c == '-', "_")
}
/// Convert OpenTelemetry metrics to GreptimeDB insert requests
///
/// See
/// <https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/metrics/v1/metrics.proto#L162>
/// for data structure of OTLP metrics.
///
/// Returns `RowInsertRequests` and the total number of rows to ingest
pub fn to_grpc_insert_requests(
request: ExportMetricsServiceRequest,
) -> Result<(RowInsertRequests, usize)> {
let mut table_writer = MultiTableData::default();
for resource in &request.resource_metrics {
let resource_attrs = resource.resource.as_ref().map(|r| &r.attributes);
for scope in &resource.scope_metrics {
let scope_attrs = scope.scope.as_ref().map(|s| &s.attributes);
for metric in &scope.metrics {
encode_metrics(&mut table_writer, metric, resource_attrs, scope_attrs)?;
}
}
}
Ok(table_writer.into_row_insert_requests())
}
fn encode_metrics(
table_writer: &mut MultiTableData,
metric: &Metric,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let name = &metric.name;
// note that we don't store description or unit; we might want to deal with
// these fields in the future.
if let Some(data) = &metric.data {
match data {
metric::Data::Gauge(gauge) => {
encode_gauge(table_writer, name, gauge, resource_attrs, scope_attrs)?;
}
metric::Data::Sum(sum) => {
encode_sum(table_writer, name, sum, resource_attrs, scope_attrs)?;
}
metric::Data::Summary(summary) => {
encode_summary(table_writer, name, summary, resource_attrs, scope_attrs)?;
}
metric::Data::Histogram(hist) => {
encode_histogram(table_writer, name, hist, resource_attrs, scope_attrs)?;
}
// TODO(sunng87) leave ExponentialHistogram for next release
metric::Data::ExponentialHistogram(_hist) => {}
}
}
Ok(())
}
fn write_attributes(
writer: &mut TableData,
row: &mut Vec<Value>,
attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
if let Some(attrs) = attrs {
let table_tags = attrs.iter().filter_map(|attr| {
if let Some(val) = attr.value.as_ref().and_then(|v| v.value.as_ref()) {
let key = normalize_otlp_name(&attr.key);
match val {
any_value::Value::StringValue(s) => Some((key, s.to_string())),
any_value::Value::IntValue(v) => Some((key, v.to_string())),
any_value::Value::DoubleValue(v) => Some((key, v.to_string())),
_ => None, // TODO(sunng87): allow different type of values
}
} else {
None
}
});
row_writer::write_tags(writer, table_tags, row)?;
}
Ok(())
}
fn write_timestamp(table: &mut TableData, row: &mut Vec<Value>, time_nano: i64) -> Result<()> {
row_writer::write_ts_precision(
table,
GREPTIME_TIMESTAMP,
Some(time_nano),
Precision::Nanosecond,
row,
)
}
fn write_data_point_value(
table: &mut TableData,
row: &mut Vec<Value>,
field: &str,
value: &Option<number_data_point::Value>,
) -> Result<()> {
match value {
Some(number_data_point::Value::AsInt(val)) => {
// we coerce all values to f64
row_writer::write_f64(table, field, *val as f64, row)?;
}
Some(number_data_point::Value::AsDouble(val)) => {
row_writer::write_f64(table, field, *val, row)?;
}
_ => {}
}
Ok(())
}
fn write_tags_and_timestamp(
table: &mut TableData,
row: &mut Vec<Value>,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
data_point_attrs: Option<&Vec<KeyValue>>,
timestamp_nanos: i64,
) -> Result<()> {
write_attributes(table, row, resource_attrs)?;
write_attributes(table, row, scope_attrs)?;
write_attributes(table, row, data_point_attrs)?;
write_timestamp(table, row, timestamp_nanos)?;
Ok(())
}
/// encode this gauge metric
///
/// note that there can be multiple data points in the request; they are
/// stored as multiple rows
fn encode_gauge(
table_writer: &mut MultiTableData,
name: &str,
gauge: &Gauge,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let table = table_writer.get_or_default_table_data(
&normalize_otlp_name(name),
APPROXIMATE_COLUMN_COUNT,
gauge.data_points.len(),
);
for data_point in &gauge.data_points {
let mut row = table.alloc_one_row();
write_tags_and_timestamp(
table,
&mut row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?;
table.add_row(row);
}
Ok(())
}
/// encode this sum metric
///
/// `aggregation_temporality` and `monotonic` are ignored for now
fn encode_sum(
table_writer: &mut MultiTableData,
name: &str,
sum: &Sum,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let table = table_writer.get_or_default_table_data(
&normalize_otlp_name(name),
APPROXIMATE_COLUMN_COUNT,
sum.data_points.len(),
);
for data_point in &sum.data_points {
let mut row = table.alloc_one_row();
write_tags_and_timestamp(
table,
&mut row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?;
table.add_row(row);
}
Ok(())
}
const HISTOGRAM_LE_COLUMN: &str = "le";
/// Encode histogram data. This function writes rows into 3 tables.
///
/// The implementation follows the Prometheus histogram table format:
///
/// - A `%metric%_bucket` table with an `le` tag storing the bucket upper
/// limit and `greptime_value` storing the cumulative bucket count
/// - A `%metric%_sum` table storing the sum of samples
/// - A `%metric%_count` table storing the count of samples
///
/// Thanks to this Prometheus compatibility, we hope to be able to use
/// Prometheus quantile functions on these tables.
fn encode_histogram(
table_writer: &mut MultiTableData,
name: &str,
hist: &Histogram,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let normalized_name = normalize_otlp_name(name);
let bucket_table_name = format!("{}_bucket", normalized_name);
let sum_table_name = format!("{}_sum", normalized_name);
let count_table_name = format!("{}_count", normalized_name);
let data_points_len = hist.data_points.len();
// Note that the row and column counts here are approximate
let mut bucket_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len * 3);
let mut sum_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len);
let mut count_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len);
for data_point in &hist.data_points {
let mut accumulated_count = 0;
for (idx, count) in data_point.bucket_counts.iter().enumerate() {
let mut bucket_row = bucket_table.alloc_one_row();
write_tags_and_timestamp(
&mut bucket_table,
&mut bucket_row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
if let Some(upper_bounds) = data_point.explicit_bounds.get(idx) {
row_writer::write_tag(
&mut bucket_table,
HISTOGRAM_LE_COLUMN,
upper_bounds,
&mut bucket_row,
)?;
} else if idx == data_point.explicit_bounds.len() {
// The last bucket
row_writer::write_tag(
&mut bucket_table,
HISTOGRAM_LE_COLUMN,
f64::INFINITY,
&mut bucket_row,
)?;
}
accumulated_count += count;
row_writer::write_f64(
&mut bucket_table,
GREPTIME_VALUE,
accumulated_count as f64,
&mut bucket_row,
)?;
bucket_table.add_row(bucket_row);
}
if let Some(sum) = data_point.sum {
let mut sum_row = sum_table.alloc_one_row();
write_tags_and_timestamp(
&mut sum_table,
&mut sum_row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
row_writer::write_f64(&mut sum_table, GREPTIME_VALUE, sum, &mut sum_row)?;
sum_table.add_row(sum_row);
}
let mut count_row = count_table.alloc_one_row();
write_tags_and_timestamp(
&mut count_table,
&mut count_row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
row_writer::write_f64(
&mut count_table,
GREPTIME_VALUE,
data_point.count as f64,
&mut count_row,
)?;
count_table.add_row(count_row);
}
table_writer.add_table_data(bucket_table_name, bucket_table);
table_writer.add_table_data(sum_table_name, sum_table);
table_writer.add_table_data(count_table_name, count_table);
Ok(())
}
#[allow(dead_code)]
fn encode_exponential_histogram(_name: &str, _hist: &ExponentialHistogram) -> Result<()> {
// TODO(sunng87): implement this using a prometheus compatible way
Ok(())
}
fn encode_summary(
table_writer: &mut MultiTableData,
name: &str,
summary: &Summary,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let table = table_writer.get_or_default_table_data(
&normalize_otlp_name(name),
APPROXIMATE_COLUMN_COUNT,
summary.data_points.len(),
);
for data_point in &summary.data_points {
let mut row = table.alloc_one_row();
write_tags_and_timestamp(
table,
&mut row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
for quantile in &data_point.quantile_values {
row_writer::write_f64(
table,
&format!("greptime_p{:02}", quantile.quantile * 100f64),
quantile.value,
&mut row,
)?;
}
row_writer::write_f64(table, GREPTIME_COUNT, data_point.count as f64, &mut row)?;
table.add_row(row);
}
Ok(())
}
#[cfg(test)]
mod tests {
use opentelemetry_proto::tonic::common::v1::any_value::Value as Val;
use opentelemetry_proto::tonic::common::v1::{AnyValue, KeyValue};
use opentelemetry_proto::tonic::metrics::v1::number_data_point::Value;
use opentelemetry_proto::tonic::metrics::v1::summary_data_point::ValueAtQuantile;
use opentelemetry_proto::tonic::metrics::v1::{HistogramDataPoint, NumberDataPoint};
use super::*;
#[test]
fn test_normalize_otlp_name() {
assert_eq!(normalize_otlp_name("jvm.memory.free"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("jvm-memory-free"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("jvm_memory_free"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("JVM_MEMORY_FREE"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("JVM_memory_FREE"), "jvm_memory_free");
}
fn keyvalue(key: &str, value: &str) -> KeyValue {
KeyValue {
key: key.into(),
value: Some(AnyValue {
value: Some(Val::StringValue(value.into())),
}),
}
}
#[test]
fn test_encode_gauge() {
let mut tables = MultiTableData::default();
let data_points = vec![
NumberDataPoint {
attributes: vec![keyvalue("host", "testsevrer")],
time_unix_nano: 100,
value: Some(Value::AsInt(100)),
..Default::default()
},
NumberDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 105,
value: Some(Value::AsInt(105)),
..Default::default()
},
];
let gauge = Gauge { data_points };
encode_gauge(
&mut tables,
"datamon",
&gauge,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
let table = tables.get_or_default_table_data("datamon", 0, 0);
assert_eq!(table.num_rows(), 2);
assert_eq!(table.num_columns(), 5);
assert_eq!(
table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value"
]
);
}
#[test]
fn test_encode_sum() {
let mut tables = MultiTableData::default();
let data_points = vec![
NumberDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 100,
value: Some(Value::AsInt(100)),
..Default::default()
},
NumberDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 105,
value: Some(Value::AsInt(0)),
..Default::default()
},
];
let sum = Sum {
data_points,
..Default::default()
};
encode_sum(
&mut tables,
"datamon",
&sum,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
let table = tables.get_or_default_table_data("datamon", 0, 0);
assert_eq!(table.num_rows(), 2);
assert_eq!(table.num_columns(), 5);
assert_eq!(
table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value"
]
);
}
#[test]
fn test_encode_summary() {
let mut tables = MultiTableData::default();
let data_points = vec![SummaryDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 100,
count: 25,
sum: 5400.0,
quantile_values: vec![
ValueAtQuantile {
quantile: 0.90,
value: 1000.0,
},
ValueAtQuantile {
quantile: 0.95,
value: 3030.0,
},
],
..Default::default()
}];
let summary = Summary { data_points };
encode_summary(
&mut tables,
"datamon",
&summary,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
let table = tables.get_or_default_table_data("datamon", 0, 0);
assert_eq!(table.num_rows(), 1);
assert_eq!(table.num_columns(), 7);
assert_eq!(
table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_p90",
"greptime_p95",
"greptime_count"
]
);
}
#[test]
fn test_encode_histogram() {
let mut tables = MultiTableData::default();
let data_points = vec![HistogramDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 100,
start_time_unix_nano: 23,
count: 25,
sum: Some(100.),
max: Some(200.),
min: Some(0.03),
bucket_counts: vec![2, 4, 6, 9, 4],
explicit_bounds: vec![0.1, 1., 10., 100.],
..Default::default()
}];
let histogram = Histogram {
data_points,
aggregation_temporality: AggregationTemporality::Delta.into(),
};
encode_histogram(
&mut tables,
"histo",
&histogram,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
assert_eq!(3, tables.num_tables());
// bucket table
let bucket_table = tables.get_or_default_table_data("histo_bucket", 0, 0);
assert_eq!(bucket_table.num_rows(), 5);
assert_eq!(bucket_table.num_columns(), 6);
assert_eq!(
bucket_table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"le",
"greptime_value",
]
);
let sum_table = tables.get_or_default_table_data("histo_sum", 0, 0);
assert_eq!(sum_table.num_rows(), 1);
assert_eq!(sum_table.num_columns(), 5);
assert_eq!(
sum_table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value",
]
);
let count_table = tables.get_or_default_table_data("histo_count", 0, 0);
assert_eq!(count_table.num_rows(), 1);
assert_eq!(count_table.num_columns(), 5);
assert_eq!(
count_table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value",
]
);
}
}

View File

@@ -0,0 +1,658 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::{RowInsertRequests, Value};
use common_grpc::writer::Precision;
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
use opentelemetry_proto::tonic::common::v1::{any_value, KeyValue};
use opentelemetry_proto::tonic::metrics::v1::{metric, number_data_point, *};
use super::{GREPTIME_COUNT, GREPTIME_TIMESTAMP, GREPTIME_VALUE};
use crate::error::Result;
use crate::row_writer::{self, MultiTableData, TableData};
/// the default column count for table writer
const APPROXIMATE_COLUMN_COUNT: usize = 8;
/// Normalize otlp instrumentation, metric and attribute names
///
/// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/api.md#instrument-name-syntax>
/// - since the names are case-insensitive, we transform them to lowercase for
/// better SQL usability
/// - replace `.` and `-` with `_`
fn normalize_otlp_name(name: &str) -> String {
name.to_lowercase().replace(|c| c == '.' || c == '-', "_")
}
/// Convert OpenTelemetry metrics to GreptimeDB insert requests
///
/// See
/// <https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/metrics/v1/metrics.proto>
/// for data structure of OTLP metrics.
///
/// Returns `RowInsertRequests` and the total number of rows to ingest
pub fn to_grpc_insert_requests(
request: ExportMetricsServiceRequest,
) -> Result<(RowInsertRequests, usize)> {
let mut table_writer = MultiTableData::default();
for resource in &request.resource_metrics {
let resource_attrs = resource.resource.as_ref().map(|r| &r.attributes);
for scope in &resource.scope_metrics {
let scope_attrs = scope.scope.as_ref().map(|s| &s.attributes);
for metric in &scope.metrics {
encode_metrics(&mut table_writer, metric, resource_attrs, scope_attrs)?;
}
}
}
Ok(table_writer.into_row_insert_requests())
}
fn encode_metrics(
table_writer: &mut MultiTableData,
metric: &Metric,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let name = &metric.name;
// note that we don't store description or unit; we might want to deal with
// these fields in the future.
if let Some(data) = &metric.data {
match data {
metric::Data::Gauge(gauge) => {
encode_gauge(table_writer, name, gauge, resource_attrs, scope_attrs)?;
}
metric::Data::Sum(sum) => {
encode_sum(table_writer, name, sum, resource_attrs, scope_attrs)?;
}
metric::Data::Summary(summary) => {
encode_summary(table_writer, name, summary, resource_attrs, scope_attrs)?;
}
metric::Data::Histogram(hist) => {
encode_histogram(table_writer, name, hist, resource_attrs, scope_attrs)?;
}
// TODO(sunng87) leave ExponentialHistogram for next release
metric::Data::ExponentialHistogram(_hist) => {}
}
}
Ok(())
}
fn write_attributes(
writer: &mut TableData,
row: &mut Vec<Value>,
attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
if let Some(attrs) = attrs {
let table_tags = attrs.iter().filter_map(|attr| {
if let Some(val) = attr.value.as_ref().and_then(|v| v.value.as_ref()) {
let key = normalize_otlp_name(&attr.key);
match val {
any_value::Value::StringValue(s) => Some((key, s.to_string())),
any_value::Value::IntValue(v) => Some((key, v.to_string())),
any_value::Value::DoubleValue(v) => Some((key, v.to_string())),
_ => None, // TODO(sunng87): allow different type of values
}
} else {
None
}
});
row_writer::write_tags(writer, table_tags, row)?;
}
Ok(())
}
fn write_timestamp(table: &mut TableData, row: &mut Vec<Value>, time_nano: i64) -> Result<()> {
row_writer::write_ts_precision(
table,
GREPTIME_TIMESTAMP,
Some(time_nano),
Precision::Nanosecond,
row,
)
}
fn write_data_point_value(
table: &mut TableData,
row: &mut Vec<Value>,
field: &str,
value: &Option<number_data_point::Value>,
) -> Result<()> {
match value {
Some(number_data_point::Value::AsInt(val)) => {
// we coerce all values to f64
row_writer::write_f64(table, field, *val as f64, row)?;
}
Some(number_data_point::Value::AsDouble(val)) => {
row_writer::write_f64(table, field, *val, row)?;
}
_ => {}
}
Ok(())
}
fn write_tags_and_timestamp(
table: &mut TableData,
row: &mut Vec<Value>,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
data_point_attrs: Option<&Vec<KeyValue>>,
timestamp_nanos: i64,
) -> Result<()> {
write_attributes(table, row, resource_attrs)?;
write_attributes(table, row, scope_attrs)?;
write_attributes(table, row, data_point_attrs)?;
write_timestamp(table, row, timestamp_nanos)?;
Ok(())
}
/// encode this gauge metric
///
/// note that there can be multiple data points in the request; they are
/// stored as multiple rows
fn encode_gauge(
table_writer: &mut MultiTableData,
name: &str,
gauge: &Gauge,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let table = table_writer.get_or_default_table_data(
&normalize_otlp_name(name),
APPROXIMATE_COLUMN_COUNT,
gauge.data_points.len(),
);
for data_point in &gauge.data_points {
let mut row = table.alloc_one_row();
write_tags_and_timestamp(
table,
&mut row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?;
table.add_row(row);
}
Ok(())
}
/// encode this sum metric
///
/// `aggregation_temporality` and `monotonic` are ignored for now
fn encode_sum(
table_writer: &mut MultiTableData,
name: &str,
sum: &Sum,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let table = table_writer.get_or_default_table_data(
&normalize_otlp_name(name),
APPROXIMATE_COLUMN_COUNT,
sum.data_points.len(),
);
for data_point in &sum.data_points {
let mut row = table.alloc_one_row();
write_tags_and_timestamp(
table,
&mut row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?;
table.add_row(row);
}
Ok(())
}
const HISTOGRAM_LE_COLUMN: &str = "le";
/// Encode histogram data. This function writes rows into 3 tables.
///
/// The implementation follows the Prometheus histogram table format:
///
/// - A `%metric%_bucket` table with an `le` tag storing the bucket upper
/// limit and `greptime_value` storing the cumulative bucket count
/// - A `%metric%_sum` table storing the sum of samples
/// - A `%metric%_count` table storing the count of samples
///
/// Thanks to this Prometheus compatibility, we hope to be able to use
/// Prometheus quantile functions on these tables.
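///
/// A worked example, using the data from the histogram test below: with
/// `bucket_counts` [2, 4, 6, 9, 4] and `explicit_bounds` [0.1, 1.0, 10.0, 100.0],
/// the bucket table receives the cumulative rows le=0.1 -> 2, le=1 -> 6,
/// le=10 -> 12, le=100 -> 21 and le=+Inf -> 25, while the sum and count
/// tables each receive a single row per data point.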
fn encode_histogram(
table_writer: &mut MultiTableData,
name: &str,
hist: &Histogram,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let normalized_name = normalize_otlp_name(name);
let bucket_table_name = format!("{}_bucket", normalized_name);
let sum_table_name = format!("{}_sum", normalized_name);
let count_table_name = format!("{}_count", normalized_name);
let data_points_len = hist.data_points.len();
// Note that the row and column counts here are approximate
let mut bucket_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len * 3);
let mut sum_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len);
let mut count_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len);
for data_point in &hist.data_points {
let mut accumulated_count = 0;
for (idx, count) in data_point.bucket_counts.iter().enumerate() {
let mut bucket_row = bucket_table.alloc_one_row();
write_tags_and_timestamp(
&mut bucket_table,
&mut bucket_row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
if let Some(upper_bounds) = data_point.explicit_bounds.get(idx) {
row_writer::write_tag(
&mut bucket_table,
HISTOGRAM_LE_COLUMN,
upper_bounds,
&mut bucket_row,
)?;
} else if idx == data_point.explicit_bounds.len() {
// The last bucket
row_writer::write_tag(
&mut bucket_table,
HISTOGRAM_LE_COLUMN,
f64::INFINITY,
&mut bucket_row,
)?;
}
accumulated_count += count;
row_writer::write_f64(
&mut bucket_table,
GREPTIME_VALUE,
accumulated_count as f64,
&mut bucket_row,
)?;
bucket_table.add_row(bucket_row);
}
if let Some(sum) = data_point.sum {
let mut sum_row = sum_table.alloc_one_row();
write_tags_and_timestamp(
&mut sum_table,
&mut sum_row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
row_writer::write_f64(&mut sum_table, GREPTIME_VALUE, sum, &mut sum_row)?;
sum_table.add_row(sum_row);
}
let mut count_row = count_table.alloc_one_row();
write_tags_and_timestamp(
&mut count_table,
&mut count_row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
row_writer::write_f64(
&mut count_table,
GREPTIME_VALUE,
data_point.count as f64,
&mut count_row,
)?;
count_table.add_row(count_row);
}
table_writer.add_table_data(bucket_table_name, bucket_table);
table_writer.add_table_data(sum_table_name, sum_table);
table_writer.add_table_data(count_table_name, count_table);
Ok(())
}
#[allow(dead_code)]
fn encode_exponential_histogram(_name: &str, _hist: &ExponentialHistogram) -> Result<()> {
// TODO(sunng87): implement this using a prometheus compatible way
Ok(())
}
fn encode_summary(
table_writer: &mut MultiTableData,
name: &str,
summary: &Summary,
resource_attrs: Option<&Vec<KeyValue>>,
scope_attrs: Option<&Vec<KeyValue>>,
) -> Result<()> {
let table = table_writer.get_or_default_table_data(
&normalize_otlp_name(name),
APPROXIMATE_COLUMN_COUNT,
summary.data_points.len(),
);
for data_point in &summary.data_points {
let mut row = table.alloc_one_row();
write_tags_and_timestamp(
table,
&mut row,
resource_attrs,
scope_attrs,
Some(data_point.attributes.as_ref()),
data_point.time_unix_nano as i64,
)?;
for quantile in &data_point.quantile_values {
row_writer::write_f64(
table,
&format!("greptime_p{:02}", quantile.quantile * 100f64),
quantile.value,
&mut row,
)?;
}
row_writer::write_f64(table, GREPTIME_COUNT, data_point.count as f64, &mut row)?;
table.add_row(row);
}
Ok(())
}
#[cfg(test)]
mod tests {
use opentelemetry_proto::tonic::common::v1::any_value::Value as Val;
use opentelemetry_proto::tonic::common::v1::{AnyValue, KeyValue};
use opentelemetry_proto::tonic::metrics::v1::number_data_point::Value;
use opentelemetry_proto::tonic::metrics::v1::summary_data_point::ValueAtQuantile;
use opentelemetry_proto::tonic::metrics::v1::{HistogramDataPoint, NumberDataPoint};
use super::*;
#[test]
fn test_normalize_otlp_name() {
assert_eq!(normalize_otlp_name("jvm.memory.free"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("jvm-memory-free"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("jvm_memory_free"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("JVM_MEMORY_FREE"), "jvm_memory_free");
assert_eq!(normalize_otlp_name("JVM_memory_FREE"), "jvm_memory_free");
}
fn keyvalue(key: &str, value: &str) -> KeyValue {
KeyValue {
key: key.into(),
value: Some(AnyValue {
value: Some(Val::StringValue(value.into())),
}),
}
}
#[test]
fn test_encode_gauge() {
let mut tables = MultiTableData::default();
let data_points = vec![
NumberDataPoint {
attributes: vec![keyvalue("host", "testsevrer")],
time_unix_nano: 100,
value: Some(Value::AsInt(100)),
..Default::default()
},
NumberDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 105,
value: Some(Value::AsInt(105)),
..Default::default()
},
];
let gauge = Gauge { data_points };
encode_gauge(
&mut tables,
"datamon",
&gauge,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
let table = tables.get_or_default_table_data("datamon", 0, 0);
assert_eq!(table.num_rows(), 2);
assert_eq!(table.num_columns(), 5);
assert_eq!(
table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value"
]
);
}
#[test]
fn test_encode_sum() {
let mut tables = MultiTableData::default();
let data_points = vec![
NumberDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 100,
value: Some(Value::AsInt(100)),
..Default::default()
},
NumberDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 105,
value: Some(Value::AsInt(0)),
..Default::default()
},
];
let sum = Sum {
data_points,
..Default::default()
};
encode_sum(
&mut tables,
"datamon",
&sum,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
let table = tables.get_or_default_table_data("datamon", 0, 0);
assert_eq!(table.num_rows(), 2);
assert_eq!(table.num_columns(), 5);
assert_eq!(
table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value"
]
);
}
#[test]
fn test_encode_summary() {
let mut tables = MultiTableData::default();
let data_points = vec![SummaryDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 100,
count: 25,
sum: 5400.0,
quantile_values: vec![
ValueAtQuantile {
quantile: 0.90,
value: 1000.0,
},
ValueAtQuantile {
quantile: 0.95,
value: 3030.0,
},
],
..Default::default()
}];
let summary = Summary { data_points };
encode_summary(
&mut tables,
"datamon",
&summary,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
let table = tables.get_or_default_table_data("datamon", 0, 0);
assert_eq!(table.num_rows(), 1);
assert_eq!(table.num_columns(), 7);
assert_eq!(
table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_p90",
"greptime_p95",
"greptime_count"
]
);
}
#[test]
fn test_encode_histogram() {
let mut tables = MultiTableData::default();
let data_points = vec![HistogramDataPoint {
attributes: vec![keyvalue("host", "testserver")],
time_unix_nano: 100,
start_time_unix_nano: 23,
count: 25,
sum: Some(100.),
max: Some(200.),
min: Some(0.03),
bucket_counts: vec![2, 4, 6, 9, 4],
explicit_bounds: vec![0.1, 1., 10., 100.],
..Default::default()
}];
let histogram = Histogram {
data_points,
aggregation_temporality: AggregationTemporality::Delta.into(),
};
encode_histogram(
&mut tables,
"histo",
&histogram,
Some(&vec![keyvalue("resource", "app")]),
Some(&vec![keyvalue("scope", "otel")]),
)
.unwrap();
assert_eq!(3, tables.num_tables());
// bucket table
let bucket_table = tables.get_or_default_table_data("histo_bucket", 0, 0);
assert_eq!(bucket_table.num_rows(), 5);
assert_eq!(bucket_table.num_columns(), 6);
assert_eq!(
bucket_table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"le",
"greptime_value",
]
);
let sum_table = tables.get_or_default_table_data("histo_sum", 0, 0);
assert_eq!(sum_table.num_rows(), 1);
assert_eq!(sum_table.num_columns(), 5);
assert_eq!(
sum_table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value",
]
);
let count_table = tables.get_or_default_table_data("histo_count", 0, 0);
assert_eq!(count_table.num_rows(), 1);
assert_eq!(count_table.num_columns(), 5);
assert_eq!(
count_table
.columns()
.iter()
.map(|c| &c.column_name)
.collect::<Vec<&String>>(),
vec![
"resource",
"scope",
"host",
"greptime_timestamp",
"greptime_value",
]
);
}
}
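
Taken together, the metrics path decodes a protobuf payload and hands it to `to_grpc_insert_requests`; a hedged end-to-end sketch, assuming `payload` is the raw request body and `prost::Message` is in scope:

let request = ExportMetricsServiceRequest::decode(&payload[..])?;
let (inserts, row_count) = to_grpc_insert_requests(request)?;
// `inserts` holds one insert request per normalized metric table.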

View File

@@ -0,0 +1,28 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use super::trace::TraceSpans;
/// A transformer that reshapes an `ExportTraceServiceRequest` before ingestion, for example:
/// - uplifting some fields from attributes (a map type) into dedicated columns
pub trait TraceParser: Send + Sync {
fn parse(&self, request: ExportTraceServiceRequest) -> TraceSpans;
fn table_name(&self) -> String;
}
pub type TraceParserRef = Arc<dyn TraceParser>;
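
A minimal sketch of an implementation, assuming a plugin that keeps the default span parsing and default table name provided by the trace module in this change (`DefaultTraceParser` is hypothetical):

struct DefaultTraceParser;

impl TraceParser for DefaultTraceParser {
    // Delegate to the default OTLP span parsing.
    fn parse(&self, request: ExportTraceServiceRequest) -> TraceSpans {
        super::trace::parse(request)
    }

    // Write into the default preview table.
    fn table_name(&self) -> String {
        super::trace::TRACE_TABLE_NAME.to_string()
    }
}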

View File

@@ -0,0 +1,411 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use api::v1::value::ValueData;
use api::v1::{ColumnDataType, RowInsertRequests};
use common_grpc::writer::Precision;
use common_time::time::Time;
use itertools::Itertools;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use opentelemetry_proto::tonic::common::v1::any_value::Value as OtlpValue;
use opentelemetry_proto::tonic::common::v1::{
AnyValue, ArrayValue, InstrumentationScope, KeyValue, KeyValueList,
};
use opentelemetry_proto::tonic::trace::v1::span::{Event, Link};
use opentelemetry_proto::tonic::trace::v1::{Span, Status};
use serde_json::json;
use super::{GREPTIME_TIMESTAMP, GREPTIME_VALUE};
use crate::error::Result;
use crate::row_writer::{self, MultiTableData, TableData};
const APPROXIMATE_COLUMN_COUNT: usize = 24;
pub const TRACE_TABLE_NAME: &str = "traces_preview_v01";
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct TraceSpan {
// the following are tags
pub trace_id: String,
pub span_id: String,
pub parent_span_id: String,
// the following are fields
pub resource_attributes: String, // TODO(yuanbohan): Map in the future
pub scope_name: String,
pub scope_version: String,
pub scope_attributes: String, // TODO(yuanbohan): Map in the future
pub trace_state: String,
pub span_name: String,
pub span_kind: String,
pub span_status_code: String,
pub span_status_message: String,
pub span_attributes: String, // TODO(yuanbohan): Map in the future
pub span_events: String, // TODO(yuanbohan): List in the future
pub span_links: String, // TODO(yuanbohan): List in the future
pub start_in_nanosecond: u64, // this is also the Timestamp Index
pub end_in_nanosecond: u64,
pub uplifted_fields: Vec<(String, ColumnDataType, ValueData)>,
}
pub type TraceSpans = Vec<TraceSpan>;
/// Convert TraceSpans to GreptimeDB row insert requests.
/// Returns `RowInsertRequests` and the total number of rows to ingest
pub fn to_grpc_insert_requests(
table_name: String,
spans: TraceSpans,
) -> Result<(RowInsertRequests, usize)> {
let mut multi_table_writer = MultiTableData::default();
let one_table_writer = multi_table_writer.get_or_default_table_data(
table_name,
APPROXIMATE_COLUMN_COUNT,
spans.len(),
);
for span in spans {
write_span_to_row(one_table_writer, span)?;
}
Ok(multi_table_writer.into_row_insert_requests())
}
pub fn write_span_to_row(writer: &mut TableData, span: TraceSpan) -> Result<()> {
let mut row = writer.alloc_one_row();
{
// tags
let iter = vec![
("trace_id", span.trace_id),
("span_id", span.span_id),
("parent_span_id", span.parent_span_id),
]
.into_iter()
.map(|(col, val)| (col.to_string(), val));
row_writer::write_tags(writer, iter, &mut row)?;
}
{
// fields
let str_fields_iter = vec![
("resource_attributes", span.resource_attributes),
("scope_name", span.scope_name),
("scope_version", span.scope_version),
("scope_attributes", span.scope_attributes),
("trace_state", span.trace_state),
("span_name", span.span_name),
("span_kind", span.span_kind),
("span_status_code", span.span_status_code),
("span_status_message", span.span_status_message),
("span_attributes", span.span_attributes),
("span_events", span.span_events),
("span_links", span.span_links),
]
.into_iter()
.map(|(col, val)| {
(
col.into(),
ColumnDataType::String,
ValueData::StringValue(val),
)
});
let time_fields_iter = vec![
("start", span.start_in_nanosecond),
("end", span.end_in_nanosecond),
]
.into_iter()
.map(|(col, val)| {
(
col.into(),
ColumnDataType::TimestampNanosecond,
ValueData::TimestampNanosecondValue(val as i64),
)
});
row_writer::write_fields(writer, str_fields_iter, &mut row)?;
row_writer::write_fields(writer, time_fields_iter, &mut row)?;
row_writer::write_fields(writer, span.uplifted_fields.into_iter(), &mut row)?;
}
row_writer::write_f64(
writer,
GREPTIME_VALUE,
(span.end_in_nanosecond - span.start_in_nanosecond) as f64 / 1_000_000.0, // duration in milliseconds
&mut row,
)?;
row_writer::write_ts_precision(
writer,
GREPTIME_TIMESTAMP,
Some(span.start_in_nanosecond as i64),
Precision::Nanosecond,
&mut row,
)?;
writer.add_row(row);
Ok(())
}
pub fn parse_span(
resource_attrs: &[KeyValue],
scope: &InstrumentationScope,
span: Span,
) -> TraceSpan {
let (span_status_code, span_status_message) = status_to_string(&span.status);
let span_kind = span.kind().as_str_name().into();
TraceSpan {
trace_id: bytes_to_hex_string(&span.trace_id),
span_id: bytes_to_hex_string(&span.span_id),
parent_span_id: bytes_to_hex_string(&span.parent_span_id),
resource_attributes: vec_kv_to_string(resource_attrs),
trace_state: span.trace_state,
scope_name: scope.name.clone(),
scope_version: scope.version.clone(),
scope_attributes: vec_kv_to_string(&scope.attributes),
span_name: span.name,
span_kind,
span_status_code,
span_status_message,
span_attributes: vec_kv_to_string(&span.attributes),
span_events: events_to_string(&span.events),
span_links: links_to_string(&span.links),
start_in_nanosecond: span.start_time_unix_nano,
end_in_nanosecond: span.end_time_unix_nano,
uplifted_fields: vec![],
}
}
/// Convert OpenTelemetry traces to TraceSpans
///
/// See
/// <https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto>
/// for data structure of OTLP traces.
pub fn parse(request: ExportTraceServiceRequest) -> TraceSpans {
let mut spans = vec![];
for resource_spans in request.resource_spans {
let resource_attrs = resource_spans
.resource
.map(|r| r.attributes)
.unwrap_or_default();
for scope_spans in resource_spans.scope_spans {
let scope = scope_spans.scope.unwrap_or_default();
for span in scope_spans.spans {
spans.push(parse_span(&resource_attrs, &scope, span));
}
}
}
spans
}
pub fn bytes_to_hex_string(bs: &[u8]) -> String {
bs.iter().map(|b| format!("{:02x}", b)).join("")
}
pub fn arr_vals_to_string(arr: &ArrayValue) -> String {
let vs: Vec<String> = arr
.values
.iter()
.filter_map(|val| any_value_to_string(val.clone()))
.collect();
serde_json::to_string(&vs).unwrap_or_else(|_| "[]".into())
}
pub fn vec_kv_to_string(vec: &[KeyValue]) -> String {
let vs: HashMap<String, String> = vec
.iter()
.map(|kv| {
let val = kv
.value
.clone()
.and_then(any_value_to_string)
.unwrap_or_default();
(kv.key.clone(), val)
})
.collect();
serde_json::to_string(&vs).unwrap_or_else(|_| "{}".into())
}
pub fn kvlist_to_string(kvlist: &KeyValueList) -> String {
vec_kv_to_string(&kvlist.values)
}
pub fn any_value_to_string(val: AnyValue) -> Option<String> {
val.value.map(|value| match value {
OtlpValue::StringValue(s) => s,
OtlpValue::BoolValue(b) => b.to_string(),
OtlpValue::IntValue(i) => i.to_string(),
OtlpValue::DoubleValue(d) => d.to_string(),
OtlpValue::ArrayValue(arr) => arr_vals_to_string(&arr),
OtlpValue::KvlistValue(kv) => kvlist_to_string(&kv),
OtlpValue::BytesValue(bs) => bytes_to_hex_string(&bs),
})
}
pub fn event_to_string(event: &Event) -> String {
json!({
"name": event.name,
"time": Time::new_nanosecond(event.time_unix_nano as i64).to_iso8601_string(),
"attrs": vec_kv_to_string(&event.attributes),
})
.to_string()
}
pub fn events_to_string(events: &[Event]) -> String {
let v: Vec<String> = events.iter().map(event_to_string).collect();
serde_json::to_string(&v).unwrap_or_else(|_| "[]".into())
}
pub fn link_to_string(link: &Link) -> String {
json!({
"trace_id": link.trace_id,
"span_id": link.span_id,
"trace_state": link.trace_state,
"attributes": vec_kv_to_string(&link.attributes),
})
.to_string()
}
pub fn links_to_string(links: &[Link]) -> String {
let v: Vec<String> = links.iter().map(link_to_string).collect();
serde_json::to_string(&v).unwrap_or_else(|_| "[]".into())
}
pub fn status_to_string(status: &Option<Status>) -> (String, String) {
match status {
Some(status) => (status.code().as_str_name().into(), status.message.clone()),
None => ("".into(), "".into()),
}
}
#[cfg(test)]
mod tests {
use common_time::time::Time;
use opentelemetry_proto::tonic::common::v1::{
any_value, AnyValue, ArrayValue, KeyValue, KeyValueList,
};
use opentelemetry_proto::tonic::trace::v1::span::Event;
use opentelemetry_proto::tonic::trace::v1::Status;
use serde_json::json;
use crate::otlp::trace::{
arr_vals_to_string, bytes_to_hex_string, event_to_string, kvlist_to_string,
status_to_string, vec_kv_to_string,
};
#[test]
fn test_bytes_to_hex_string() {
assert_eq!(
"24fe79948641b110a29bc27859307e8d",
bytes_to_hex_string(&[
36, 254, 121, 148, 134, 65, 177, 16, 162, 155, 194, 120, 89, 48, 126, 141,
])
);
assert_eq!(
"baffeedd7b8debc0",
bytes_to_hex_string(&[186, 255, 238, 221, 123, 141, 235, 192,])
);
}
#[test]
fn test_arr_vals_to_string() {
assert_eq!("[]", arr_vals_to_string(&ArrayValue { values: vec![] }));
let arr = ArrayValue {
values: vec![
AnyValue {
value: Some(any_value::Value::StringValue("string_value".into())),
},
AnyValue {
value: Some(any_value::Value::BoolValue(true)),
},
AnyValue {
value: Some(any_value::Value::IntValue(1)),
},
AnyValue {
value: Some(any_value::Value::DoubleValue(1.2)),
},
],
};
let expect = json!(["string_value", "true", "1", "1.2"]).to_string();
assert_eq!(expect, arr_vals_to_string(&arr));
}
#[test]
fn test_kv_list_to_string() {
let kvlist = KeyValueList {
values: vec![KeyValue {
key: "str_key".into(),
value: Some(AnyValue {
value: Some(any_value::Value::StringValue("val1".into())),
}),
}],
};
let expect = json!({
"str_key": "val1",
})
.to_string();
assert_eq!(expect, kvlist_to_string(&kvlist))
}
#[test]
fn test_event_to_string() {
let attributes = vec![KeyValue {
key: "str_key".into(),
value: Some(AnyValue {
value: Some(any_value::Value::StringValue("val1".into())),
}),
}];
let event = Event {
time_unix_nano: 1697620662450128000_u64,
name: "event_name".into(),
attributes,
dropped_attributes_count: 0,
};
let event_string = event_to_string(&event);
let expect = json!({
"name": event.name,
"time": Time::new_nanosecond(event.time_unix_nano as i64).to_iso8601_string(),
"attrs": vec_kv_to_string(&event.attributes),
});
assert_eq!(
expect,
serde_json::from_str::<serde_json::value::Value>(event_string.as_str()).unwrap()
);
}
#[test]
fn test_status_to_string() {
let message = String::from("status message");
let status = Status {
code: 1,
message: message.clone(),
};
assert_eq!(
("STATUS_CODE_OK".into(), message),
status_to_string(&Some(status)),
);
}
}
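
Putting the pieces of this module together, trace ingestion presumably boils down to the following sketch (error handling elided; `export_trace_request` is an assumed ExportTraceServiceRequest):

let spans = parse(export_trace_request);
let (requests, row_count) = to_grpc_insert_requests(TRACE_TABLE_NAME.to_string(), spans)?;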

View File

@@ -34,6 +34,9 @@ use common_query::Output;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use session::context::QueryContextRef;
use crate::error::Result;
@@ -74,7 +77,7 @@ pub trait InfluxdbLineProtocolHandler {
pub trait OpentsdbProtocolHandler {
/// A successful request will not return a response.
/// Only on error will the socket return a line of data.
async fn exec(&self, data_point: &DataPoint, ctx: QueryContextRef) -> Result<()>;
async fn exec(&self, data_points: Vec<DataPoint>, ctx: QueryContextRef) -> Result<usize>;
}
pub struct PromStoreResponse {
@@ -101,4 +104,11 @@ pub trait OpenTelemetryProtocolHandler {
request: ExportMetricsServiceRequest,
ctx: QueryContextRef,
) -> Result<ExportMetricsServiceResponse>;
/// Handling opentelemetry traces request
async fn traces(
&self,
request: ExportTraceServiceRequest,
ctx: QueryContextRef,
) -> Result<ExportTraceServiceResponse>;
}
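
For the OpentsdbProtocolHandler change above, an implementation now ingests a whole batch and reports the number of affected rows. A hedged skeleton that reuses the conversion helper added in the opentsdb server module (`MyInstance` and the insert pipeline call are hypothetical):

#[async_trait]
impl OpentsdbProtocolHandler for MyInstance {
    async fn exec(&self, data_points: Vec<DataPoint>, ctx: QueryContextRef) -> Result<usize> {
        let (requests, rows) = data_point_to_grpc_row_insert_requests(data_points)?;
        // Hand `requests` to the gRPC insert pipeline together with `ctx` here.
        let _ = (requests, ctx);
        Ok(rows)
    }
}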

View File

@@ -51,7 +51,8 @@ impl GrpcQueryHandler for DummyInstance {
#[async_trait]
impl OpentsdbProtocolHandler for DummyInstance {
async fn exec(&self, data_point: &DataPoint, _ctx: QueryContextRef) -> Result<()> {
async fn exec(&self, data_points: Vec<DataPoint>, _ctx: QueryContextRef) -> Result<usize> {
let data_point = data_points.first().unwrap();
if data_point.metric() == "should_failed" {
return error::InternalSnafu {
err_msg: "expected",
@@ -59,7 +60,7 @@ impl OpentsdbProtocolHandler for DummyInstance {
.fail();
}
let _ = self.tx.send(data_point.metric().to_string()).await;
Ok(())
Ok(data_points.len())
}
}
@@ -163,19 +164,13 @@ async fn test_opentsdb_put() {
.send()
.await;
assert_eq!(result.status(), 500);
assert_eq!(
result.text().await,
"{\"error\":\"Internal error: Internal error: expected\"}"
);
assert_eq!(result.text().await, "{\"error\":\"Internal error: 1003\"}");
let mut metrics = vec![];
while let Ok(s) = rx.try_recv() {
metrics.push(s);
}
assert_eq!(
metrics,
vec!["m1".to_string(), "m2".to_string(), "m3".to_string()]
);
assert_eq!(metrics, vec!["m1".to_string(), "m2".to_string()]);
}
#[tokio::test]
@@ -208,7 +203,7 @@ async fn test_opentsdb_debug_put() {
.send()
.await;
assert_eq!(result.status(), 200);
assert_eq!(result.text().await, "{\"success\":0,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: expected\"}]}");
assert_eq!(result.text().await, "{\"success\":0,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: 1003\"}]}");
// multiple data point summary debug put
let result = client
@@ -233,7 +228,7 @@ async fn test_opentsdb_debug_put() {
.send()
.await;
assert_eq!(result.status(), 200);
assert_eq!(result.text().await, "{\"success\":1,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: expected\"}]}");
assert_eq!(result.text().await, "{\"success\":1,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: 1003\"}]}");
let mut metrics = vec![];
while let Ok(s) = rx.try_recv() {

View File

@@ -37,8 +37,8 @@ struct DummyOpentsdbInstance {
#[async_trait]
impl OpentsdbProtocolHandler for DummyOpentsdbInstance {
async fn exec(&self, data_point: &DataPoint, _ctx: QueryContextRef) -> Result<()> {
let metric = data_point.metric();
async fn exec(&self, data_points: Vec<DataPoint>, _ctx: QueryContextRef) -> Result<usize> {
let metric = data_points.first().unwrap().metric();
if metric == "should_failed" {
return server_error::InternalSnafu {
err_msg: "expected",
@@ -47,7 +47,7 @@ impl OpentsdbProtocolHandler for DummyOpentsdbInstance {
}
let i = metric.parse::<i32>().unwrap();
let _ = self.tx.send(i * i).await;
Ok(())
Ok(data_points.len())
}
}

View File

@@ -245,11 +245,7 @@ impl ChunkReaderBuilder {
reader_builder = reader_builder.push_batch_iter(iter);
}
let predicate = Predicate::try_new(
self.filters.clone(),
self.schema.store_schema().schema().clone(),
)
.context(error::BuildPredicateSnafu)?;
let predicate = Predicate::new(self.filters.clone());
let read_opts = ReadOptions {
batch_size: self.iter_ctx.batch_size,

View File

@@ -277,7 +277,10 @@ impl ParquetReader {
let pruned_row_groups = self
.predicate
.prune_row_groups(builder.metadata().row_groups())
.prune_row_groups(
builder.metadata().row_groups(),
store_schema.schema().clone(),
)
.into_iter()
.enumerate()
.filter_map(|(idx, valid)| if valid { Some(idx) } else { None })
@@ -549,12 +552,11 @@ mod tests {
let operator = create_object_store(dir.path().to_str().unwrap());
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
let user_schema = projected_schema.projected_user_schema().clone();
let reader = ParquetReader::new(
sst_file_handle,
operator,
projected_schema,
Predicate::empty(user_schema),
Predicate::empty(),
TimestampRange::min_to_max(),
);
@@ -636,12 +638,11 @@ mod tests {
let operator = create_object_store(dir.path().to_str().unwrap());
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
let user_schema = projected_schema.projected_user_schema().clone();
let reader = ParquetReader::new(
file_handle,
operator,
projected_schema,
Predicate::empty(user_schema),
Predicate::empty(),
TimestampRange::min_to_max(),
);
@@ -665,14 +666,8 @@ mod tests {
range: TimestampRange,
expect: Vec<i64>,
) {
let store_schema = schema.schema_to_read().clone();
let reader = ParquetReader::new(
file_handle,
object_store,
schema,
Predicate::empty(store_schema.schema().clone()),
range,
);
let reader =
ParquetReader::new(file_handle, object_store, schema, Predicate::empty(), range);
let mut stream = reader.chunk_stream().await.unwrap();
let result = stream.next_batch().await;

View File

@@ -29,9 +29,11 @@ use datatypes::prelude::ConcreteDataType;
use parquet::arrow::arrow_reader::{ArrowPredicate, RowFilter};
use parquet::arrow::ProjectionMask;
use parquet::schema::types::SchemaDescriptor;
use snafu::ResultExt;
use table::predicate::Predicate;
use crate::error;
use crate::error::BuildPredicateSnafu;
use crate::schema::StoreSchema;
/// Builds row filters according to predicates.
@@ -80,7 +82,11 @@ pub(crate) fn build_row_filter(
Box::new(PlainTimestampRowFilter::new(time_range, ts_col_projection)) as _
};
let mut predicates = vec![time_range_row_filter];
if let Ok(datafusion_filters) = predicate_to_row_filter(predicate, projection_mask) {
if let Ok(datafusion_filters) = predicate_to_row_filter(
predicate,
projection_mask,
store_schema.schema().arrow_schema(),
) {
predicates.extend(datafusion_filters);
}
let filter = RowFilter::new(predicates);
@@ -90,9 +96,13 @@ pub(crate) fn build_row_filter(
fn predicate_to_row_filter(
predicate: &Predicate,
projection_mask: ProjectionMask,
schema: &arrow::datatypes::SchemaRef,
) -> error::Result<Vec<Box<dyn ArrowPredicate>>> {
let mut datafusion_predicates = Vec::with_capacity(predicate.exprs().len());
for expr in predicate.exprs() {
let physical_exprs = predicate
.to_physical_exprs(schema)
.context(BuildPredicateSnafu)?;
let mut datafusion_predicates = Vec::with_capacity(physical_exprs.len());
for expr in &physical_exprs {
datafusion_predicates.push(Box::new(DatafusionArrowPredicate {
projection_mask: projection_mask.clone(),
physical_expr: expr.clone(),

@@ -35,6 +35,8 @@ pub struct TableReference<'a> {
pub table: &'a str,
}
pub type OwnedTableReference = TableReference<'static>;
// TODO(LFC): Find a better place for `TableReference`,
// so that we can reuse the default catalog and schema consts.
// Could be done together with issue #559.

@@ -27,6 +27,7 @@ use datafusion_expr::expr::InList;
use datafusion_expr::{Between, BinaryExpr, ColumnarValue, Operator};
use datafusion_physical_expr::execution_props::ExecutionProps;
use datafusion_physical_expr::{create_physical_expr, PhysicalExpr};
use datatypes::arrow;
use datatypes::arrow::array::BooleanArray;
use datatypes::schema::SchemaRef;
use datatypes::value::scalar_value_to_timestamp;
@@ -39,19 +40,24 @@ mod stats;
#[derive(Clone)]
pub struct Predicate {
/// The schema of the table that the expressions being applied.
schema: SchemaRef,
/// Physical expressions of this predicate.
exprs: Vec<Arc<dyn PhysicalExpr>>,
/// logical exprs
exprs: Vec<Expr>,
}
impl Predicate {
/// Creates a new `Predicate` by converting logical exprs to physical exprs that can be
/// evaluated against record batches.
/// Returns error when failed to convert exprs.
pub fn try_new(exprs: Vec<Expr>, schema: SchemaRef) -> error::Result<Self> {
let arrow_schema = schema.arrow_schema();
let df_schema = arrow_schema
pub fn new(exprs: Vec<Expr>) -> Self {
Self { exprs }
}
/// Builds physical exprs according to provided schema.
pub fn to_physical_exprs(
&self,
schema: &arrow::datatypes::SchemaRef,
) -> error::Result<Vec<Arc<dyn PhysicalExpr>>> {
let df_schema = schema
.clone()
.to_dfschema_ref()
.context(error::DatafusionSnafu)?;
@@ -61,47 +67,38 @@ impl Predicate {
// registering variables.
let execution_props = &ExecutionProps::new();
let physical_exprs = exprs
self.exprs
.iter()
.map(|expr| {
create_physical_expr(
expr.df_expr(),
df_schema.as_ref(),
arrow_schema.as_ref(),
execution_props,
)
create_physical_expr(expr.df_expr(), df_schema.as_ref(), schema, execution_props)
})
.collect::<Result<_, _>>()
.context(error::DatafusionSnafu)?;
Ok(Self {
schema,
exprs: physical_exprs,
})
}
#[inline]
pub fn exprs(&self) -> &[Arc<dyn PhysicalExpr>] {
&self.exprs
.context(error::DatafusionSnafu)
}
/// Builds an empty predicate from given schema.
pub fn empty(schema: SchemaRef) -> Self {
Self {
schema,
exprs: vec![],
}
pub fn empty() -> Self {
Self { exprs: vec![] }
}
/// Evaluates the predicate against row group metadata.
/// Returns a vector of boolean values, among which `false` means the row group can be skipped.
pub fn prune_row_groups(&self, row_groups: &[RowGroupMetaData]) -> Vec<bool> {
pub fn prune_row_groups(
&self,
row_groups: &[RowGroupMetaData],
schema: SchemaRef,
) -> Vec<bool> {
let mut res = vec![true; row_groups.len()];
let arrow_schema = self.schema.arrow_schema();
for expr in &self.exprs {
let Ok(physical_exprs) = self.to_physical_exprs(schema.arrow_schema()) else {
return res;
};
let arrow_schema = schema.arrow_schema();
for expr in &physical_exprs {
match PruningPredicate::try_new(expr.clone(), arrow_schema.clone()) {
Ok(p) => {
let stat = RowGroupPruningStatistics::new(row_groups, &self.schema);
let stat = RowGroupPruningStatistics::new(row_groups, &schema);
match p.prune(&stat) {
Ok(r) => {
for (curr_val, res) in r.into_iter().zip(res.iter_mut()) {
@@ -123,7 +120,9 @@ impl Predicate {
/// Prunes primary keys
pub fn prune_primary_key(&self, primary_key: &RecordBatch) -> error::Result<bool> {
for expr in &self.exprs {
let pk_schema = primary_key.schema();
let physical_exprs = self.to_physical_exprs(&pk_schema)?;
for expr in &physical_exprs {
// evaluate every filter against primary key
let Ok(eva) = expr.evaluate(primary_key) else {
continue;
@@ -156,11 +155,22 @@ impl Predicate {
/// Evaluates the predicate against the `stats`.
/// Returns a vector of boolean values, among which `false` means the row group can be skipped.
pub fn prune_with_stats<S: PruningStatistics>(&self, stats: &S) -> Vec<bool> {
pub fn prune_with_stats<S: PruningStatistics>(
&self,
stats: &S,
schema: &arrow::datatypes::SchemaRef,
) -> Vec<bool> {
let mut res = vec![true; stats.num_containers()];
let arrow_schema = self.schema.arrow_schema();
for expr in &self.exprs {
match PruningPredicate::try_new(expr.clone(), arrow_schema.clone()) {
let physical_exprs = match self.to_physical_exprs(schema) {
Ok(expr) => expr,
Err(e) => {
warn!(e; "Failed to build physical expr from predicates: {:?}", &self.exprs);
return res;
}
};
for expr in &physical_exprs {
match PruningPredicate::try_new(expr.clone(), schema.clone()) {
Ok(p) => match p.prune(stats) {
Ok(r) => {
for (curr_val, res) in r.into_iter().zip(res.iter_mut()) {
@@ -641,7 +651,7 @@ mod tests {
let dir = create_temp_dir("prune_parquet");
let (path, schema) = gen_test_parquet_file(&dir, array_cnt).await;
let schema = Arc::new(datatypes::schema::Schema::try_from(schema).unwrap());
let arrow_predicate = Predicate::try_new(filters, schema.clone()).unwrap();
let arrow_predicate = Predicate::new(filters);
let builder = ParquetRecordBatchStreamBuilder::new(
tokio::fs::OpenOptions::new()
.read(true)
@@ -653,7 +663,7 @@ mod tests {
.unwrap();
let metadata = builder.metadata().clone();
let row_groups = metadata.row_groups();
let res = arrow_predicate.prune_row_groups(row_groups);
let res = arrow_predicate.prune_row_groups(row_groups, schema);
assert_eq!(expect, res);
}
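
Taken together, these hunks make `Predicate` schema-agnostic: it now stores logical exprs and builds physical exprs only when a caller hands it a concrete schema, typically the real schema of the SST file being pruned rather than the table's logical schema. A minimal sketch of the new flow, assuming the `Predicate` API shown above; the wrapper function and its argument names are hypothetical:

use datatypes::schema::SchemaRef;
use parquet::file::metadata::RowGroupMetaData;
use table::predicate::Predicate;

// `Expr` here is the logical expression wrapper accepted by `Predicate::new`
// in the hunk above.
fn prune_with_file_schema(
    filters: Vec<Expr>,
    row_groups: &[RowGroupMetaData],
    file_schema: SchemaRef,
) -> Vec<bool> {
    // No schema is captured at construction time any more.
    let predicate = Predicate::new(filters);
    // Physical exprs are created inside `prune_row_groups` against `file_schema`,
    // so they line up with the columns actually stored in this file.
    predicate.prune_row_groups(row_groups, file_schema)
}

The same `to_physical_exprs` path now backs `prune_primary_key` (which derives the schema from the primary-key record batch) and `prune_with_stats` (which takes the schema as an argument), as shown in the hunks above.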

@@ -46,6 +46,8 @@ mod tests {
async fn test_exec(instance: &Arc<Instance>) {
let ctx = QueryContext::arc();
// should create new table "my_metric_1" directly
let data_point1 = DataPoint::new(
"my_metric_1".to_string(),
1000,
@@ -55,9 +57,8 @@ mod tests {
("tagk2".to_string(), "tagv2".to_string()),
],
);
// should create new table "my_metric_1" directly
instance.exec(&data_point1, ctx.clone()).await.unwrap();
// should create new column "tagk3" directly
let data_point2 = DataPoint::new(
"my_metric_1".to_string(),
2000,
@@ -67,12 +68,12 @@ mod tests {
("tagk3".to_string(), "tagv3".to_string()),
],
);
// should create new column "tagk3" directly
instance.exec(&data_point2, ctx.clone()).await.unwrap();
let data_point3 = DataPoint::new("my_metric_1".to_string(), 3000, 3.0, vec![]);
// should handle null tags properly
instance.exec(&data_point3, ctx.clone()).await.unwrap();
let data_point3 = DataPoint::new("my_metric_1".to_string(), 3000, 3.0, vec![]);
let data_points = vec![data_point1, data_point2, data_point3];
instance.exec(data_points, ctx.clone()).await.unwrap();
let output = instance
.do_query(
@@ -87,13 +88,13 @@ mod tests {
let recordbatches = RecordBatches::try_collect(stream).await.unwrap();
let pretty_print = recordbatches.pretty_print().unwrap();
let expected = vec![
"+---------------------+----------------+-------+-------+-------+",
"| greptime_timestamp | greptime_value | tagk1 | tagk2 | tagk3 |",
"+---------------------+----------------+-------+-------+-------+",
"| 1970-01-01T00:00:01 | 1.0 | tagv1 | tagv2 | |",
"| 1970-01-01T00:00:02 | 2.0 | | tagv2 | tagv3 |",
"| 1970-01-01T00:00:03 | 3.0 | | | |",
"+---------------------+----------------+-------+-------+-------+",
"+-------+-------+----------------+---------------------+-------+",
"| tagk1 | tagk2 | greptime_value | greptime_timestamp | tagk3 |",
"+-------+-------+----------------+---------------------+-------+",
"| tagv1 | tagv2 | 1.0 | 1970-01-01T00:00:01 | |",
"| | tagv2 | 2.0 | 1970-01-01T00:00:02 | tagv3 |",
"| | | 3.0 | 1970-01-01T00:00:03 | |",
"+-------+-------+----------------+---------------------+-------+",
]
.into_iter()
.join("\n");

@@ -92,7 +92,7 @@ impl GreptimeDbStandaloneBuilder {
.init()
.await
.unwrap();
procedure_manager.start().await.unwrap();
let instance = Instance::try_new_standalone(
kv_store,
procedure_manager,

@@ -237,7 +237,7 @@ SELECT i FROM (SELECT * FROM integers i1 UNION SELECT * FROM integers i2) a WHER
-- SELECT * FROM (SELECT i1.i AS a, i2.i AS b, row_number() OVER (ORDER BY i1.i, i2.i) FROM integers i1, integers i2 WHERE i1.i IS NOT NULL AND i2.i IS NOT NULL) a1 WHERE a=b ORDER BY 1;
SELECT * FROM (SELECT 0=1 AS cond FROM integers i1, integers i2) a1 WHERE cond ORDER BY 1;
Error: 1003(Internal), Invalid argument error: must either specify a row count or at least one column
Error: 3001(EngineExecuteQuery), Invalid argument error: must either specify a row count or at least one column
SELECT * FROM (SELECT 0=1 AS cond FROM integers i1, integers i2 GROUP BY 1) a1 WHERE cond ORDER BY 1;

@@ -13,7 +13,7 @@ Error: 3000(PlanQuery), Error during planning: Order by column out of bounds, sp
-- Not work in greptimedb
SELECT a FROM test ORDER BY 'hello', a;
Error: 1003(Internal), Error during planning: Sort operation is not applicable to scalar value hello
Error: 3001(EngineExecuteQuery), Error during planning: Sort operation is not applicable to scalar value hello
-- Ambiguous reference in union alias, give and error in duckdb, but works in greptimedb
SELECT a AS k, b FROM test UNION SELECT a, b AS k FROM test ORDER BY k;
@@ -54,7 +54,7 @@ Error: 3000(PlanQuery), Error during planning: Order by column out of bounds, sp
SELECT a % 2, b FROM test UNION SELECT a % 2 AS k, b FROM test ORDER BY -1;
Error: 1003(Internal), Error during planning: Sort operation is not applicable to scalar value -1
Error: 3001(EngineExecuteQuery), Error during planning: Sort operation is not applicable to scalar value -1
SELECT a % 2, b FROM test UNION SELECT a % 2 AS k FROM test ORDER BY -1;

@@ -0,0 +1,54 @@
CREATE TABLE host (
ts timestamp(3) time index,
host STRING PRIMARY KEY,
val BIGINT,
);
Affected Rows: 0
INSERT INTO TABLE host VALUES
(0, 'host1', 0),
(5000, 'host1', null),
(10000, 'host1', 1),
(15000, 'host1', null),
(20000, 'host1', 2),
(0, 'host2', 3),
(5000, 'host2', null),
(10000, 'host2', 4),
(15000, 'host2', null),
(20000, 'host2', 5);
Affected Rows: 10
-- Test by calculate
SELECT ts, length(host), max(val) RANGE '5s' FROM host ALIGN '20s' BY (length(host)) ORDER BY ts;
+---------------------+-----------------------------+----------------------------------+
| ts | character_length(host.host) | MAX(host.val) RANGE 5s FILL NULL |
+---------------------+-----------------------------+----------------------------------+
| 1970-01-01T00:00:00 | 5 | 3 |
| 1970-01-01T00:00:20 | 5 | 5 |
+---------------------+-----------------------------+----------------------------------+
SELECT ts, max(val) RANGE '5s' FROM host ALIGN '20s' BY (2) ORDER BY ts;
+---------------------+----------------------------------+
| ts | MAX(host.val) RANGE 5s FILL NULL |
+---------------------+----------------------------------+
| 1970-01-01T00:00:00 | 3 |
| 1970-01-01T00:00:20 | 5 |
+---------------------+----------------------------------+
SELECT ts, CAST(length(host) as INT64) + 2, max(val) RANGE '5s' FROM host ALIGN '20s' BY (CAST(length(host) as INT64) + 2) ORDER BY ts;
+---------------------+----------------------------------------+----------------------------------+
| ts | character_length(host.host) + Int64(2) | MAX(host.val) RANGE 5s FILL NULL |
+---------------------+----------------------------------------+----------------------------------+
| 1970-01-01T00:00:00 | 7 | 3 |
| 1970-01-01T00:00:20 | 7 | 5 |
+---------------------+----------------------------------------+----------------------------------+
DROP TABLE host;
Affected Rows: 0

@@ -0,0 +1,27 @@
CREATE TABLE host (
ts timestamp(3) time index,
host STRING PRIMARY KEY,
val BIGINT,
);
INSERT INTO TABLE host VALUES
(0, 'host1', 0),
(5000, 'host1', null),
(10000, 'host1', 1),
(15000, 'host1', null),
(20000, 'host1', 2),
(0, 'host2', 3),
(5000, 'host2', null),
(10000, 'host2', 4),
(15000, 'host2', null),
(20000, 'host2', 5);
-- Test by calculate
SELECT ts, length(host), max(val) RANGE '5s' FROM host ALIGN '20s' BY (length(host)) ORDER BY ts;
SELECT ts, max(val) RANGE '5s' FROM host ALIGN '20s' BY (2) ORDER BY ts;
SELECT ts, CAST(length(host) as INT64) + 2, max(val) RANGE '5s' FROM host ALIGN '20s' BY (CAST(length(host) as INT64) + 2) ORDER BY ts;
DROP TABLE host;

@@ -0,0 +1,194 @@
CREATE TABLE host (
ts timestamp(3) time index,
host STRING PRIMARY KEY,
val BIGINT,
);
Affected Rows: 0
INSERT INTO TABLE host VALUES
(0, 'host1', 0),
(5000, 'host1', null),
(10000, 'host1', 1),
(15000, 'host1', null),
(20000, 'host1', 2),
(0, 'host2', 3),
(5000, 'host2', null),
(10000, 'host2', 4),
(15000, 'host2', null),
(20000, 'host2', 5);
Affected Rows: 10
-- Test range expr calculate
SELECT ts, host, covar(val, val) RANGE '20s' FROM host ALIGN '10s' ORDER BY host, ts;
+---------------------+-------+---------------------------------------------------+
| ts | host | COVARIANCE(host.val,host.val) RANGE 20s FILL NULL |
+---------------------+-------+---------------------------------------------------+
| 1970-01-01T00:00:00 | host1 | |
| 1970-01-01T00:00:10 | host1 | 0.5 |
| 1970-01-01T00:00:20 | host1 | 0.5 |
| 1970-01-01T00:00:30 | host1 | |
| 1970-01-01T00:00:00 | host2 | |
| 1970-01-01T00:00:10 | host2 | 0.5 |
| 1970-01-01T00:00:20 | host2 | 0.5 |
| 1970-01-01T00:00:30 | host2 | |
+---------------------+-------+---------------------------------------------------+
SELECT ts, host, 2 * min(val) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
+---------------------+-------+---------------------------------------------+
| ts | host | Int64(2) * MIN(host.val) RANGE 5s FILL NULL |
+---------------------+-------+---------------------------------------------+
| 1970-01-01T00:00:00 | host1 | 0 |
| 1970-01-01T00:00:05 | host1 | |
| 1970-01-01T00:00:10 | host1 | 2 |
| 1970-01-01T00:00:15 | host1 | |
| 1970-01-01T00:00:20 | host1 | 4 |
| 1970-01-01T00:00:00 | host2 | 6 |
| 1970-01-01T00:00:05 | host2 | |
| 1970-01-01T00:00:10 | host2 | 8 |
| 1970-01-01T00:00:15 | host2 | |
| 1970-01-01T00:00:20 | host2 | 10 |
+---------------------+-------+---------------------------------------------+
SELECT ts, host, min(val * 2) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
+---------------------+-------+---------------------------------------------+
| ts | host | MIN(host.val * Int64(2)) RANGE 5s FILL NULL |
+---------------------+-------+---------------------------------------------+
| 1970-01-01T00:00:00 | host1 | 0 |
| 1970-01-01T00:00:05 | host1 | |
| 1970-01-01T00:00:10 | host1 | 2 |
| 1970-01-01T00:00:15 | host1 | |
| 1970-01-01T00:00:20 | host1 | 4 |
| 1970-01-01T00:00:00 | host2 | 6 |
| 1970-01-01T00:00:05 | host2 | |
| 1970-01-01T00:00:10 | host2 | 8 |
| 1970-01-01T00:00:15 | host2 | |
| 1970-01-01T00:00:20 | host2 | 10 |
+---------------------+-------+---------------------------------------------+
SELECT ts, host, min(CAST(val as Float64)) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
+---------------------+-------+----------------------------------+
| ts | host | MIN(host.val) RANGE 5s FILL NULL |
+---------------------+-------+----------------------------------+
| 1970-01-01T00:00:00 | host1 | 0.0 |
| 1970-01-01T00:00:05 | host1 | |
| 1970-01-01T00:00:10 | host1 | 1.0 |
| 1970-01-01T00:00:15 | host1 | |
| 1970-01-01T00:00:20 | host1 | 2.0 |
| 1970-01-01T00:00:00 | host2 | 3.0 |
| 1970-01-01T00:00:05 | host2 | |
| 1970-01-01T00:00:10 | host2 | 4.0 |
| 1970-01-01T00:00:15 | host2 | |
| 1970-01-01T00:00:20 | host2 | 5.0 |
+---------------------+-------+----------------------------------+
SELECT ts, host, min(floor(CAST(val as Float64))) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
+---------------------+-------+-----------------------------------------+
| ts | host | MIN(floor(host.val)) RANGE 5s FILL NULL |
+---------------------+-------+-----------------------------------------+
| 1970-01-01T00:00:00 | host1 | 0.0 |
| 1970-01-01T00:00:05 | host1 | |
| 1970-01-01T00:00:10 | host1 | 1.0 |
| 1970-01-01T00:00:15 | host1 | |
| 1970-01-01T00:00:20 | host1 | 2.0 |
| 1970-01-01T00:00:00 | host2 | 3.0 |
| 1970-01-01T00:00:05 | host2 | |
| 1970-01-01T00:00:10 | host2 | 4.0 |
| 1970-01-01T00:00:15 | host2 | |
| 1970-01-01T00:00:20 | host2 | 5.0 |
+---------------------+-------+-----------------------------------------+
SELECT ts, host, floor(min(val) RANGE '5s') FROM host ALIGN '5s' ORDER BY host, ts;
+---------------------+-------+-----------------------------------------+
| ts | host | floor(MIN(host.val) RANGE 5s FILL NULL) |
+---------------------+-------+-----------------------------------------+
| 1970-01-01T00:00:00 | host1 | 0.0 |
| 1970-01-01T00:00:05 | host1 | |
| 1970-01-01T00:00:10 | host1 | 1.0 |
| 1970-01-01T00:00:15 | host1 | |
| 1970-01-01T00:00:20 | host1 | 2.0 |
| 1970-01-01T00:00:00 | host2 | 3.0 |
| 1970-01-01T00:00:05 | host2 | |
| 1970-01-01T00:00:10 | host2 | 4.0 |
| 1970-01-01T00:00:15 | host2 | |
| 1970-01-01T00:00:20 | host2 | 5.0 |
+---------------------+-------+-----------------------------------------+
-- Test complex range expr calculate
SELECT ts, host, (min(val) + max(val)) RANGE '20s' + 1.0 FROM host ALIGN '10s' ORDER BY host, ts;
+---------------------+-------+------------------------------------------------------------------------------------+
| ts | host | MIN(host.val) RANGE 20s FILL NULL + MAX(host.val) RANGE 20s FILL NULL + Float64(1) |
+---------------------+-------+------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00 | host1 | 1.0 |
| 1970-01-01T00:00:10 | host1 | 2.0 |
| 1970-01-01T00:00:20 | host1 | 4.0 |
| 1970-01-01T00:00:30 | host1 | 5.0 |
| 1970-01-01T00:00:00 | host2 | 7.0 |
| 1970-01-01T00:00:10 | host2 | 8.0 |
| 1970-01-01T00:00:20 | host2 | 10.0 |
| 1970-01-01T00:00:30 | host2 | 11.0 |
+---------------------+-------+------------------------------------------------------------------------------------+
SELECT ts, host, covar(ceil(CAST(val as Float64)), floor(CAST(val as Float64))) RANGE '20s' FROM host ALIGN '10s' ORDER BY host, ts;
+---------------------+-------+----------------------------------------------------------------+
| ts | host | COVARIANCE(ceil(host.val),floor(host.val)) RANGE 20s FILL NULL |
+---------------------+-------+----------------------------------------------------------------+
| 1970-01-01T00:00:00 | host1 | |
| 1970-01-01T00:00:10 | host1 | 0.5 |
| 1970-01-01T00:00:20 | host1 | 0.5 |
| 1970-01-01T00:00:30 | host1 | |
| 1970-01-01T00:00:00 | host2 | |
| 1970-01-01T00:00:10 | host2 | 0.5 |
| 1970-01-01T00:00:20 | host2 | 0.5 |
| 1970-01-01T00:00:30 | host2 | |
+---------------------+-------+----------------------------------------------------------------+
SELECT ts, host, floor(cos(ceil(sin(min(val) RANGE '5s')))) FROM host ALIGN '5s' ORDER BY host, ts;
+---------------------+-------+---------------------------------------------------------+
| ts | host | floor(cos(ceil(sin(MIN(host.val) RANGE 5s FILL NULL)))) |
+---------------------+-------+---------------------------------------------------------+
| 1970-01-01T00:00:00 | host1 | 1.0 |
| 1970-01-01T00:00:05 | host1 | |
| 1970-01-01T00:00:10 | host1 | 0.0 |
| 1970-01-01T00:00:15 | host1 | |
| 1970-01-01T00:00:20 | host1 | 0.0 |
| 1970-01-01T00:00:00 | host2 | 0.0 |
| 1970-01-01T00:00:05 | host2 | |
| 1970-01-01T00:00:10 | host2 | 1.0 |
| 1970-01-01T00:00:15 | host2 | |
| 1970-01-01T00:00:20 | host2 | 1.0 |
+---------------------+-------+---------------------------------------------------------+
SELECT ts, host, gcd(CAST(max(floor(CAST(val as Float64))) RANGE '10s' FILL PREV as INT64) * 4, max(val * 4) RANGE '10s' FILL PREV) * length(host) + 1 FROM host ALIGN '5s' ORDER BY host, ts;
+---------------------+-------+------------------------------------------------------------------------------------------------------------------------------------------------+
| ts | host | gcd(MAX(floor(host.val)) RANGE 10s FILL PREV * Int64(4),MAX(host.val * Int64(4)) RANGE 10s FILL PREV) * character_length(host.host) + Int64(1) |
+---------------------+-------+------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00 | host1 | 1 |
| 1970-01-01T00:00:05 | host1 | 1 |
| 1970-01-01T00:00:10 | host1 | 21 |
| 1970-01-01T00:00:15 | host1 | 21 |
| 1970-01-01T00:00:20 | host1 | 41 |
| 1970-01-01T00:00:25 | host1 | 41 |
| 1970-01-01T00:00:00 | host2 | 61 |
| 1970-01-01T00:00:05 | host2 | 61 |
| 1970-01-01T00:00:10 | host2 | 81 |
| 1970-01-01T00:00:15 | host2 | 81 |
| 1970-01-01T00:00:20 | host2 | 101 |
| 1970-01-01T00:00:25 | host2 | 101 |
+---------------------+-------+------------------------------------------------------------------------------------------------------------------------------------------------+
DROP TABLE host;
Affected Rows: 0

@@ -0,0 +1,43 @@
CREATE TABLE host (
ts timestamp(3) time index,
host STRING PRIMARY KEY,
val BIGINT,
);
INSERT INTO TABLE host VALUES
(0, 'host1', 0),
(5000, 'host1', null),
(10000, 'host1', 1),
(15000, 'host1', null),
(20000, 'host1', 2),
(0, 'host2', 3),
(5000, 'host2', null),
(10000, 'host2', 4),
(15000, 'host2', null),
(20000, 'host2', 5);
-- Test range expr calculate
SELECT ts, host, covar(val, val) RANGE '20s' FROM host ALIGN '10s' ORDER BY host, ts;
SELECT ts, host, 2 * min(val) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
SELECT ts, host, min(val * 2) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
SELECT ts, host, min(CAST(val as Float64)) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
SELECT ts, host, min(floor(CAST(val as Float64))) RANGE '5s' FROM host ALIGN '5s' ORDER BY host, ts;
SELECT ts, host, floor(min(val) RANGE '5s') FROM host ALIGN '5s' ORDER BY host, ts;
-- Test complex range expr calculate
SELECT ts, host, (min(val) + max(val)) RANGE '20s' + 1.0 FROM host ALIGN '10s' ORDER BY host, ts;
SELECT ts, host, covar(ceil(CAST(val as Float64)), floor(CAST(val as Float64))) RANGE '20s' FROM host ALIGN '10s' ORDER BY host, ts;
SELECT ts, host, floor(cos(ceil(sin(min(val) RANGE '5s')))) FROM host ALIGN '5s' ORDER BY host, ts;
SELECT ts, host, gcd(CAST(max(floor(CAST(val as Float64))) RANGE '10s' FILL PREV as INT64) * 4, max(val * 4) RANGE '10s' FILL PREV) * length(host) + 1 FROM host ALIGN '5s' ORDER BY host, ts;
DROP TABLE host;

@@ -0,0 +1,82 @@
CREATE TABLE host (
ts timestamp(3) time index,
host STRING PRIMARY KEY,
val BIGINT,
);
Affected Rows: 0
INSERT INTO TABLE host VALUES
(0, 'host1', 0),
(5000, 'host1', null),
(10000, 'host1', 1),
(15000, 'host1', null),
(20000, 'host1', 2),
(0, 'host2', 3),
(5000, 'host2', null),
(10000, 'host2', 4),
(15000, 'host2', null),
(20000, 'host2', 5);
Affected Rows: 10
-- Test Invalid cases
-- 1. error timestamp
SELECT min(val) RANGE 'not_time' FROM host ALIGN '5s';
Error: 2000(InvalidSyntax), sql parser error: not a valid duration string: not_time
SELECT min(val) RANGE '5s' FROM host ALIGN 'not_time';
Error: 2000(InvalidSyntax), sql parser error: not a valid duration string: not_time
-- 2.1 no range param
SELECT min(val) FROM host ALIGN '5s';
Error: 2000(InvalidSyntax), sql parser error: Illegal Range select, no RANGE keyword found in any SelectItem
SELECT min(val) RANGE '10s', max(val) FROM host ALIGN '5s';
Error: 3001(EngineExecuteQuery), No field named "MAX(host.val)". Valid fields are "MIN(host.val) RANGE 10s FILL NULL", host.ts, host.host.
SELECT min(val) * 2 RANGE '10s' FROM host ALIGN '5s';
Error: 2000(InvalidSyntax), sql parser error: Can't use the RANGE keyword in Expr 2 without function
SELECT 1 RANGE '10s' FILL NULL FROM host ALIGN '1h' FILL NULL;
Error: 2000(InvalidSyntax), sql parser error: Can't use the RANGE keyword in Expr 1 without function
-- 2.2 no align param
SELECT min(val) RANGE '5s' FROM host;
Error: 3000(PlanQuery), Error during planning: Missing argument in range select query
-- 2.3 type mismatch
SELECT covar(ceil(val), floor(val)) RANGE '20s' FROM host ALIGN '10s';
Error: 3001(EngineExecuteQuery), Internal error: Unsupported data type Int64 for function ceil. This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker
-- 2.4 nest query
SELECT min(max(val) RANGE '20s') RANGE '20s' FROM host ALIGN '10s';
Error: 2000(InvalidSyntax), Range Query: Nest Range Query is not allowed
-- 2.5 wrong Aggregate
SELECT rank() OVER (PARTITION BY host ORDER BY ts DESC) RANGE '10s' FROM host ALIGN '5s';
Error: 2000(InvalidSyntax), Range Query: Window functions is not allowed in Range Query
-- 2.6 invalid fill
SELECT min(val) RANGE '5s', min(val) RANGE '5s' FILL NULL FROM host ALIGN '5s';
Error: 3001(EngineExecuteQuery), Schema contains duplicate unqualified field name "MIN(host.val) RANGE 5s FILL NULL"
SELECT min(val) RANGE '5s' FROM host ALIGN '5s' FILL 3.0;
Error: 3000(PlanQuery), Error during planning: 3.0 is not a valid fill option, fail to convert to a const value. { Arrow error: Cast error: Cannot cast string '3.0' to value of Int64 type }
DROP TABLE host;
Affected Rows: 0

Some files were not shown because too many files have changed in this diff.