Mirror of https://github.com/GreptimeTeam/greptimedb.git, synced 2026-01-06 13:22:57 +00:00.

Comparing commits: v0.7.0 ... fix-proto- (38 commits).
Commits:

- 038bc4fe6e
- 6d07c422d8
- 6c14ece23f
- 89c51d9b87
- e4333969b4
- b55905cf66
- fb4da05f25
- 904484b525
- cafb4708ce
- 7c895e2605
- 9afe327bca
- 58bd065c6b
- 9aa8f756ab
- 7639c227ca
- 1255c1fc9e
- 06dcd0f6ed
- 0a4444a43a
- b7ac8d6aa8
- e767f37241
- da098f5568
- aa953dcc34
- aa125a50f9
- d8939eb891
- 0bb949787c
- 8c37c3fc0f
- 21ff3620be
- aeca0d8e8a
- a309cd018a
- 3ee53360ee
- 352bd7b6fd
- 3f3ef2e7af
- a218f12bd9
- c884c56151
- 9ec288cab9
- 1f1491e429
- c52bc613e0
- a9d42f7b87
- 86ce2d8713
.editorconfig (new file, 10 lines)

root = true

[*]
end_of_line = lf
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true

[{Makefile,**.mk}]
indent_style = tab
@@ -21,3 +21,6 @@ GT_GCS_CREDENTIAL_PATH = GCS credential path
 GT_GCS_ENDPOINT = GCS end point
 # Settings for kafka wal test
 GT_KAFKA_ENDPOINTS = localhost:9092
+
+# Setting for fuzz tests
+GT_MYSQL_ADDR = localhost:4002
.github/actions/fuzz-test/action.yaml (new file, 13 lines)

name: Fuzz Test
description: 'Fuzz test given setup and service'
inputs:
  target:
    description: "The fuzz target to test"
runs:
  using: composite
  steps:
    - name: Run Fuzz Test
      shell: bash
      run: cargo fuzz run ${{ inputs.target }} --fuzz-dir tests-fuzz -D -s none -- -max_total_time=120
      env:
        GT_MYSQL_ADDR: 127.0.0.1:4002
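
For reference, a libFuzzer-style target such as `fuzz_create_table` is typically declared with the `libfuzzer-sys` and `arbitrary` crates (both show up in the `Cargo.lock` changes below). A minimal sketch of such a target — the `CreateTableAst` type and `render_sql` helper are hypothetical stand-ins, not the actual `tests-fuzz` code:

```rust
// Hypothetical sketch of a cargo-fuzz target like `fuzz_create_table`.
// Assumes libfuzzer-sys and arbitrary (with its "derive" feature), as added
// to Cargo.lock in this change; everything below the imports is illustrative.
#![no_main]

use arbitrary::Arbitrary;
use libfuzzer_sys::fuzz_target;

// Stand-in for the structured input the real generators produce.
#[derive(Debug, Arbitrary)]
struct CreateTableAst {
    table_name: String,
}

// Stand-in for the real SQL renderer in tests-fuzz.
fn render_sql(ast: &CreateTableAst) -> String {
    format!("CREATE TABLE `{}` (ts TIMESTAMP TIME INDEX)", ast.table_name)
}

fuzz_target!(|ast: CreateTableAst| {
    // The real harness executes this against the server at GT_MYSQL_ADDR.
    let _sql = render_sql(&ast);
});
```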
.github/workflows/develop.yml (42 lines changed)

@@ -102,7 +102,7 @@ jobs:
           shared-key: "build-binaries"
       - name: Build greptime binaries
         shell: bash
-        run: cargo build
+        run: cargo build --bin greptime --bin sqlness-runner
       - name: Pack greptime binaries
         shell: bash
         run: |
@@ -117,6 +117,46 @@ jobs:
           artifacts-dir: bins
           version: current

+  fuzztest:
+    name: Fuzz Test
+    needs: build
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        target: [ "fuzz_create_table", "fuzz_alter_table" ]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: arduino/setup-protoc@v3
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_TOOLCHAIN }}
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          # Shares across multiple jobs
+          shared-key: "fuzz-test-targets"
+      - name: Set Rust Fuzz
+        shell: bash
+        run: |
+          sudo apt update && sudo apt install -y libfuzzer-14-dev
+          cargo install cargo-fuzz
+      - name: Download pre-built binaries
+        uses: actions/download-artifact@v4
+        with:
+          name: bins
+          path: .
+      - name: Unzip binaries
+        run: tar -xvf ./bins.tar.gz
+      - name: Run GreptimeDB
+        run: |
+          ./bins/greptime standalone start&
+      - name: Fuzz Test
+        uses: ./.github/actions/fuzz-test
+        env:
+          CUSTOM_LIBFUZZER_PATH: /usr/lib/llvm-14/lib/libFuzzer.a
+        with:
+          target: ${{ matrix.target }}
+
   sqlness:
     name: Sqlness Test
     needs: build
.gitignore (4 lines changed)

@@ -46,3 +46,7 @@ benchmarks/data
 *.code-workspace

 venv/
+
+# Fuzz tests
+tests-fuzz/artifacts/
+tests-fuzz/corpus/
Cargo.lock (generated, 167 lines changed)

@@ -29,6 +29,17 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"

+[[package]]
+name = "aes"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
+dependencies = [
+ "cfg-if 1.0.0",
+ "cipher",
+ "cpufeatures",
+]
+
 [[package]]
 name = "ahash"
 version = "0.7.7"
@@ -241,6 +252,15 @@ dependencies = [
  "syn 1.0.109",
 ]

+[[package]]
+name = "arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
+dependencies = [
+ "derive_arbitrary",
+]
+
 [[package]]
 name = "arc-swap"
 version = "1.6.0"
@@ -992,6 +1012,15 @@ dependencies = [
  "generic-array",
 ]

+[[package]]
+name = "block-padding"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "borsh"
 version = "1.3.0"
@@ -1266,6 +1295,15 @@ dependencies = [
  "tokio",
 ]

+[[package]]
+name = "cbc"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
+dependencies = [
+ "cipher",
+]
+
 [[package]]
 name = "cc"
 version = "1.0.83"
@@ -1421,6 +1459,16 @@ dependencies = [
  "half 1.8.2",
 ]

+[[package]]
+name = "cipher"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
+dependencies = [
+ "crypto-common",
+ "inout",
+]
+
 [[package]]
 name = "clang-sys"
 version = "1.6.1"
@@ -2912,6 +2960,17 @@ dependencies = [
  "syn 2.0.43",
 ]

+[[package]]
+name = "derive_arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.43",
+]
+
 [[package]]
 name = "derive_builder"
 version = "0.11.2"
@@ -3415,10 +3474,12 @@ dependencies = [
  "common-telemetry",
  "common-time",
  "datatypes",
+ "enum_dispatch",
  "hydroflow",
  "itertools 0.10.5",
  "num-traits",
  "serde",
+ "serde_json",
  "servers",
  "session",
  "snafu",
@@ -4406,6 +4467,16 @@ dependencies = [
  "libc",
 ]

+[[package]]
+name = "inout"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
+dependencies = [
+ "block-padding",
+ "generic-array",
+]
+
 [[package]]
 name = "instant"
 version = "0.1.12"
@@ -4746,9 +4817,20 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.151"
+version = "0.2.153"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4"
+checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
+
+[[package]]
+name = "libfuzzer-sys"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
+dependencies = [
+ "arbitrary",
+ "cc",
+ "once_cell",
+]

 [[package]]
 name = "libgit2-sys"
@@ -5989,9 +6071,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"

 [[package]]
 name = "opendal"
-version = "0.44.2"
+version = "0.45.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4af824652d4d2ffabf606d337a071677ae621b05622adf35df9562f69d9b4498"
+checksum = "52c17c077f23fa2d2c25d9d22af98baa43b8bbe2ef0de80cf66339aa70401467"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -6007,7 +6089,7 @@ dependencies = [
  "md-5",
  "once_cell",
  "percent-encoding",
- "quick-xml 0.30.0",
+ "quick-xml 0.31.0",
  "reqsign",
  "reqwest",
  "serde",
@@ -6500,6 +6582,16 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"

+[[package]]
+name = "pbkdf2"
+version = "0.12.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
+dependencies = [
+ "digest",
+ "hmac",
+]
+
 [[package]]
 name = "peeking_take_while"
 version = "0.1.2"
@@ -6540,6 +6632,12 @@ version = "2.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"

+[[package]]
+name = "permutation"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7"
+
 [[package]]
 name = "pest"
 version = "2.7.5"
@@ -6724,6 +6822,21 @@ dependencies = [
  "spki 0.7.3",
 ]

+[[package]]
+name = "pkcs5"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e847e2c91a18bfa887dd028ec33f2fe6f25db77db3619024764914affe8b69a6"
+dependencies = [
+ "aes",
+ "cbc",
+ "der 0.7.8",
+ "pbkdf2",
+ "scrypt",
+ "sha2",
+ "spki 0.7.3",
+]
+
 [[package]]
 name = "pkcs8"
 version = "0.8.0"
@@ -6742,6 +6855,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
 dependencies = [
  "der 0.7.8",
+ "pkcs5",
+ "rand_core",
  "spki 0.7.3",
 ]

@@ -7456,16 +7571,6 @@ dependencies = [
  "memchr",
 ]

-[[package]]
-name = "quick-xml"
-version = "0.30.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
-dependencies = [
- "memchr",
- "serde",
-]
-
 [[package]]
 name = "quick-xml"
 version = "0.31.0"
@@ -7748,9 +7853,9 @@ dependencies = [

 [[package]]
 name = "reqsign"
-version = "0.14.6"
+version = "0.14.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dce87f66ba6c6acef277a729f989a0eca946cb9ce6a15bcc036bda0f72d4b9fd"
+checksum = "43e319d9de9ff4d941abf4ac718897118b0fe04577ea3f8e0f5788971784eef5"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -7775,7 +7880,6 @@ dependencies = [
  "serde_json",
  "sha1",
  "sha2",
- "tokio",
 ]

 [[package]]
@@ -7968,6 +8072,7 @@ dependencies = [
  "pkcs1 0.7.5",
  "pkcs8 0.10.2",
  "rand_core",
+ "sha2",
  "signature",
  "spki 0.7.3",
  "subtle",
@@ -8702,6 +8807,15 @@ dependencies = [
  "bytemuck",
 ]

+[[package]]
+name = "salsa20"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213"
+dependencies = [
+ "cipher",
+]
+
 [[package]]
 name = "same-file"
 version = "1.0.6"
@@ -8815,6 +8929,17 @@ dependencies = [
  "tokio-test",
 ]

+[[package]]
+name = "scrypt"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f"
+dependencies = [
+ "pbkdf2",
+ "salsa20",
+ "sha2",
+]
+
 [[package]]
 name = "sct"
 version = "0.7.1"
@@ -9074,6 +9199,7 @@ dependencies = [
  "derive_builder 0.12.0",
  "digest",
  "futures",
+ "hashbrown 0.14.3",
  "headers",
  "hex",
  "hostname",
@@ -9092,6 +9218,7 @@ dependencies = [
  "opensrv-mysql",
  "opentelemetry-proto 0.3.0",
  "parking_lot 0.12.1",
+ "permutation",
  "pgwire",
  "pin-project",
  "postgres-types",
@@ -10092,15 +10219,19 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
 name = "tests-fuzz"
 version = "0.7.0"
 dependencies = [
+ "arbitrary",
  "async-trait",
  "common-error",
  "common-macro",
  "common-query",
+ "common-runtime",
  "common-telemetry",
+ "common-time",
  "datatypes",
  "derive_builder 0.12.0",
  "dotenv",
  "lazy_static",
+ "libfuzzer-sys",
  "partition",
  "rand",
  "rand_chacha",
@@ -134,7 +134,7 @@ reqwest = { version = "0.11", default-features = false, features = [
 rskafka = "0.5"
 rust_decimal = "1.33"
 serde = { version = "1.0", features = ["derive"] }
-serde_json = "1.0"
+serde_json = { version = "1.0", features = ["float_roundtrip"] }
 serde_with = "3"
 smallvec = { version = "1", features = ["serde"] }
 snafu = "0.7"
Makefile (5 lines changed)

@@ -3,6 +3,7 @@ CARGO_PROFILE ?=
 FEATURES ?=
 TARGET_DIR ?=
 TARGET ?=
+BUILD_BIN ?= greptime
 CARGO_BUILD_OPTS := --locked
 IMAGE_REGISTRY ?= docker.io
 IMAGE_NAMESPACE ?= greptime
@@ -45,6 +46,10 @@ ifneq ($(strip $(TARGET)),)
 	CARGO_BUILD_OPTS += --target ${TARGET}
 endif

+ifneq ($(strip $(BUILD_BIN)),)
+	CARGO_BUILD_OPTS += --bin ${BUILD_BIN}
+endif
+
 ifneq ($(strip $(RELEASE)),)
 	CARGO_BUILD_OPTS += --release
 endif
@@ -29,7 +29,7 @@ use client::api::v1::column::Values;
 use client::api::v1::{
     Column, ColumnDataType, ColumnDef, CreateTableExpr, InsertRequest, InsertRequests, SemanticType,
 };
-use client::{Client, Database, Output, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
 use futures_util::TryStreamExt;
 use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
 use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
@@ -502,9 +502,9 @@ async fn do_query(num_iter: usize, db: &Database, table_name: &str) {
     for i in 0..num_iter {
         let now = Instant::now();
         let res = db.sql(&query).await.unwrap();
-        match res {
-            Output::AffectedRows(_) | Output::RecordBatches(_) => (),
-            Output::Stream(stream, _) => {
+        match res.data {
+            OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => (),
+            OutputData::Stream(stream) => {
                 stream.try_collect::<Vec<_>>().await.unwrap();
             }
         }
@@ -79,7 +79,7 @@ This RFC proposes to add a new expression node `MergeScan` to merge result from
 │               │  │                             │
 └─Frontend──────┘  └─Remote-Sources──────────────┘
 ```
-This merge operation simply chains all the the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.
+This merge operation simply chains all the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.

 ## Commutativity of MergeScan

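
The coalesce-like behavior described in that RFC excerpt — chain the per-datanode streams one after another, with no reordering — can be sketched in a few lines with `futures`. This illustrates the idea only, with a stub `RecordBatch`; it is not GreptimeDB's actual `MergeScan` implementation:

```rust
// Illustrative coalesce-style merge: concatenate remote sources in order.
use futures::stream::{self, Stream, StreamExt};

// Stub standing in for a real RecordBatch.
struct RecordBatch;

fn merge_scan(
    sources: Vec<impl Stream<Item = RecordBatch>>,
) -> impl Stream<Item = RecordBatch> {
    // `flatten` drains each source to completion before moving to the next,
    // which is exactly the chaining the RFC describes.
    stream::iter(sources).flatten()
}
```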
@@ -307,7 +307,7 @@ impl Database {
                     reason: "Expect 'AffectedRows' Flight messages to be the one and the only!"
                 }
             );
-            Ok(Output::AffectedRows(rows))
+            Ok(Output::new_with_affected_rows(rows))
         }
         FlightMessage::Recordbatch(_) | FlightMessage::Metrics(_) => {
             IllegalFlightMessagesSnafu {
@@ -340,7 +340,7 @@ impl Database {
                 output_ordering: None,
                 metrics: Default::default(),
             };
-            Ok(Output::new_stream(Box::pin(record_batch_stream)))
+            Ok(Output::new_with_stream(Box::pin(record_batch_stream)))
         }
     }
 }
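
The `Output` changes repeated across this diff — `res.data`, `Output::new_with_affected_rows`, `Output::new_with_stream`, and the new `OutputData`/`OutputMeta` re-exports — suggest `Output` became a struct wrapping an `OutputData` payload. A rough sketch of the shape those call sites imply; the field layout and stand-in types are inferred, not the real `common_query` definitions:

```rust
// Shape inferred from call sites in this diff; stand-in types throughout.
struct RecordBatches;
struct SendableRecordBatchStream;
struct OutputMeta; // hypothetical metadata carried alongside the data

enum OutputData {
    AffectedRows(usize),
    RecordBatches(RecordBatches),
    Stream(SendableRecordBatchStream),
}

struct Output {
    data: OutputData,
    meta: OutputMeta,
}

impl Output {
    fn new_with_affected_rows(rows: usize) -> Self {
        Output { data: OutputData::AffectedRows(rows), meta: OutputMeta }
    }

    fn new_with_stream(stream: SendableRecordBatchStream) -> Self {
        Output { data: OutputData::Stream(stream), meta: OutputMeta }
    }
}

fn main() {
    // Consumers now match on the payload, as the updated call sites do.
    match Output::new_with_affected_rows(3).data {
        OutputData::AffectedRows(n) => println!("affected {n} rows"),
        _ => (),
    }
}
```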
@@ -26,7 +26,7 @@ use api::v1::greptime_response::Response;
 use api::v1::{AffectedRows, GreptimeResponse};
 pub use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
 use common_error::status_code::StatusCode;
-pub use common_query::Output;
+pub use common_query::{Output, OutputData, OutputMeta};
 pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
 use snafu::OptionExt;

@@ -62,7 +62,9 @@ pub struct BenchTableMetadataCommand {

 impl BenchTableMetadataCommand {
     pub async fn build(&self) -> Result<Instance> {
-        let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr]).await.unwrap();
+        let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr], 128)
+            .await
+            .unwrap();

         let table_metadata_manager = Arc::new(TableMetadataManager::new(etcd_store));

@@ -19,8 +19,7 @@ use async_trait::async_trait;
 use clap::{Parser, ValueEnum};
 use client::api::v1::auth_header::AuthScheme;
 use client::api::v1::Basic;
-use client::{Client, Database, DEFAULT_SCHEMA_NAME};
-use common_query::Output;
+use client::{Client, Database, OutputData, DEFAULT_SCHEMA_NAME};
 use common_recordbatch::util::collect;
 use common_telemetry::{debug, error, info, warn};
 use datatypes::scalars::ScalarVector;
@@ -142,7 +141,7 @@ impl Export {
             .with_context(|_| RequestDatabaseSnafu {
                 sql: "show databases".to_string(),
             })?;
-        let Output::Stream(stream, _) = result else {
+        let OutputData::Stream(stream) = result.data else {
             NotDataFromOutputSnafu.fail()?
         };
         let record_batch = collect(stream)
@@ -183,7 +182,7 @@ impl Export {
             .sql(&sql)
             .await
             .with_context(|_| RequestDatabaseSnafu { sql })?;
-        let Output::Stream(stream, _) = result else {
+        let OutputData::Stream(stream) = result.data else {
             NotDataFromOutputSnafu.fail()?
         };
         let Some(record_batch) = collect(stream)
@@ -235,7 +234,7 @@ impl Export {
             .sql(&sql)
             .await
             .with_context(|_| RequestDatabaseSnafu { sql })?;
-        let Output::Stream(stream, _) = result else {
+        let OutputData::Stream(stream) = result.data else {
             NotDataFromOutputSnafu.fail()?
         };
         let record_batch = collect(stream)
@@ -19,7 +19,7 @@ use std::time::Instant;
 use catalog::kvbackend::{
     CachedMetaKvBackend, CachedMetaKvBackendBuilder, KvBackendCatalogManager,
 };
-use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
 use common_base::Plugins;
 use common_error::ext::ErrorExt;
 use common_query::Output;
@@ -184,15 +184,15 @@ impl Repl {
             }
             .context(RequestDatabaseSnafu { sql: &sql })?;

-            let either = match output {
-                Output::Stream(s, _) => {
+            let either = match output.data {
+                OutputData::Stream(s) => {
                     let x = RecordBatches::try_collect(s)
                         .await
                         .context(CollectRecordBatchesSnafu)?;
                     Either::Left(x)
                 }
-                Output::RecordBatches(x) => Either::Left(x),
-                Output::AffectedRows(rows) => Either::Right(rows),
+                OutputData::RecordBatches(x) => Either::Left(x),
+                OutputData::AffectedRows(rows) => Either::Right(rows),
             };

             let end = Instant::now();
@@ -70,7 +70,7 @@ impl UpgradeCommand {
             etcd_addr: &self.etcd_addr,
         })?;
         let tool = MigrateTableMetadata {
-            etcd_store: EtcdStore::with_etcd_client(client),
+            etcd_store: EtcdStore::with_etcd_client(client, 128),
             dryrun: self.dryrun,
             skip_catalog_keys: self.skip_catalog_keys,
             skip_table_global_keys: self.skip_table_global_keys,
@@ -117,10 +117,12 @@ struct StartCommand {
     /// The working home directory of this metasrv instance.
     #[clap(long)]
     data_home: Option<String>,

     /// If it's not empty, the metasrv will store all data with this key prefix.
     #[clap(long, default_value = "")]
     store_key_prefix: String,
+
+    /// The max operations per txn
+    #[clap(long)]
+    max_txn_ops: Option<usize>,
 }

 impl StartCommand {
@@ -181,6 +183,10 @@ impl StartCommand {
             opts.store_key_prefix = self.store_key_prefix.clone()
         }

+        if let Some(max_txn_ops) = self.max_txn_ops {
+            opts.max_txn_ops = max_txn_ops;
+        }
+
         // Disable dashboard in metasrv.
         opts.http.disable_dashboard = true;

@@ -28,12 +28,15 @@ const REGION: &str = "region";
 const ENABLE_VIRTUAL_HOST_STYLE: &str = "enable_virtual_host_style";

 pub fn is_supported_in_s3(key: &str) -> bool {
-    key == ENDPOINT
-        || key == ACCESS_KEY_ID
-        || key == SECRET_ACCESS_KEY
-        || key == SESSION_TOKEN
-        || key == REGION
-        || key == ENABLE_VIRTUAL_HOST_STYLE
+    [
+        ENDPOINT,
+        ACCESS_KEY_ID,
+        SECRET_ACCESS_KEY,
+        SESSION_TOKEN,
+        REGION,
+        ENABLE_VIRTUAL_HOST_STYLE,
+    ]
+    .contains(&key)
 }

 pub fn build_s3_backend(
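
The rewrite trades a chain of `==` comparisons for a single slice lookup, so supporting a new key means adding one array element. A self-contained usage sketch — only `REGION`'s value is confirmed by the hunk header; the other constant value is an assumption:

```rust
// Condensed version of is_supported_in_s3; constants inlined so it runs alone.
const REGION: &str = "region"; // value confirmed by the hunk header
const ENDPOINT: &str = "endpoint"; // assumed value

fn is_supported_in_s3(key: &str) -> bool {
    [ENDPOINT, REGION /* ...remaining keys elided... */].contains(&key)
}

fn main() {
    assert!(is_supported_in_s3("region"));
    assert!(!is_supported_in_s3("bucket"));
}
```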
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+mod clamp;
 mod modulo;
 mod pow;
 mod rate;
@@ -19,6 +20,7 @@ mod rate;
 use std::fmt;
 use std::sync::Arc;

+pub use clamp::ClampFunction;
 use common_query::error::{GeneralDataFusionSnafu, Result};
 use common_query::prelude::Signature;
 use datafusion::error::DataFusionError;
@@ -40,7 +42,8 @@ impl MathFunction {
         registry.register(Arc::new(ModuloFunction));
         registry.register(Arc::new(PowFunction));
         registry.register(Arc::new(RateFunction));
-        registry.register(Arc::new(RangeFunction))
+        registry.register(Arc::new(RangeFunction));
+        registry.register(Arc::new(ClampFunction));
     }
 }

src/common/function/src/scalars/math/clamp.rs (new file, 403 lines)

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fmt::{self, Display};
use std::sync::Arc;

use common_query::error::{InvalidFuncArgsSnafu, Result};
use common_query::prelude::Signature;
use datafusion::arrow::array::{ArrayIter, PrimitiveArray};
use datafusion::logical_expr::Volatility;
use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::prelude::VectorRef;
use datatypes::types::LogicalPrimitiveType;
use datatypes::value::TryAsPrimitive;
use datatypes::vectors::PrimitiveVector;
use datatypes::with_match_primitive_type_id;
use snafu::{ensure, OptionExt};

use crate::function::Function;

#[derive(Clone, Debug, Default)]
pub struct ClampFunction;

const CLAMP_NAME: &str = "clamp";

impl Function for ClampFunction {
    fn name(&self) -> &str {
        CLAMP_NAME
    }

    fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
        // Type check is done by `signature`
        Ok(input_types[0].clone())
    }

    fn signature(&self) -> Signature {
        // input, min, max
        Signature::uniform(3, ConcreteDataType::numerics(), Volatility::Immutable)
    }

    fn eval(
        &self,
        _func_ctx: crate::function::FunctionContext,
        columns: &[VectorRef],
    ) -> Result<VectorRef> {
        ensure!(
            columns.len() == 3,
            InvalidFuncArgsSnafu {
                err_msg: format!(
                    "The length of the args is not correct, expect exactly 3, have: {}",
                    columns.len()
                ),
            }
        );
        ensure!(
            columns[0].data_type().is_numeric(),
            InvalidFuncArgsSnafu {
                err_msg: format!(
                    "The first arg's type is not numeric, have: {}",
                    columns[0].data_type()
                ),
            }
        );
        ensure!(
            columns[0].data_type() == columns[1].data_type()
                && columns[1].data_type() == columns[2].data_type(),
            InvalidFuncArgsSnafu {
                err_msg: format!(
                    "Arguments don't have identical types: {}, {}, {}",
                    columns[0].data_type(),
                    columns[1].data_type(),
                    columns[2].data_type()
                ),
            }
        );
        ensure!(
            columns[1].len() == 1 && columns[2].len() == 1,
            InvalidFuncArgsSnafu {
                err_msg: format!(
                    "The second and third args should be scalar, have: {:?}, {:?}",
                    columns[1], columns[2]
                ),
            }
        );

        with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| {
            let input_array = columns[0].to_arrow_array();
            let input = input_array
                .as_any()
                .downcast_ref::<PrimitiveArray<<$S as LogicalPrimitiveType>::ArrowPrimitive>>()
                .unwrap();

            let min = TryAsPrimitive::<$S>::try_as_primitive(&columns[1].get(0))
                .with_context(|| {
                    InvalidFuncArgsSnafu {
                        err_msg: "The second arg should not be none",
                    }
                })?;
            let max = TryAsPrimitive::<$S>::try_as_primitive(&columns[2].get(0))
                .with_context(|| {
                    InvalidFuncArgsSnafu {
                        err_msg: "The third arg should not be none",
                    }
                })?;

            // ensure min <= max
            ensure!(
                min <= max,
                InvalidFuncArgsSnafu {
                    err_msg: format!(
                        "The second arg should be less than or equal to the third arg, have: {:?}, {:?}",
                        columns[1], columns[2]
                    ),
                }
            );

            clamp_impl::<$S, true, true>(input, min, max)
        },{
            unreachable!()
        })
    }
}

impl Display for ClampFunction {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", CLAMP_NAME.to_ascii_uppercase())
    }
}

fn clamp_impl<T: LogicalPrimitiveType, const CLAMP_MIN: bool, const CLAMP_MAX: bool>(
    input: &PrimitiveArray<T::ArrowPrimitive>,
    min: T::Native,
    max: T::Native,
) -> Result<VectorRef> {
    common_telemetry::info!("[DEBUG] min {min:?}, max {max:?}");

    let iter = ArrayIter::new(input);
    let result = iter.map(|x| {
        x.map(|x| {
            if CLAMP_MIN && x < min {
                min
            } else if CLAMP_MAX && x > max {
                max
            } else {
                x
            }
        })
    });
    let result = PrimitiveArray::<T::ArrowPrimitive>::from_iter(result);
    Ok(Arc::new(PrimitiveVector::<T>::from(result)))
}

#[cfg(test)]
mod test {

    use std::sync::Arc;

    use datatypes::prelude::ScalarVector;
    use datatypes::vectors::{
        ConstantVector, Float64Vector, Int64Vector, StringVector, UInt64Vector,
    };

    use super::*;
    use crate::function::FunctionContext;

    #[test]
    fn clamp_i64() {
        let inputs = [
            (
                vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
                -1,
                10,
                vec![Some(-1), Some(-1), Some(-1), Some(0), Some(1), Some(2)],
            ),
            (
                vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
                0,
                0,
                vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
            ),
            (
                vec![Some(-3), None, Some(-1), None, None, Some(2)],
                -2,
                1,
                vec![Some(-2), None, Some(-1), None, None, Some(1)],
            ),
            (
                vec![None, None, None, None, None],
                0,
                1,
                vec![None, None, None, None, None],
            ),
        ];

        let func = ClampFunction;
        for (in_data, min, max, expected) in inputs {
            let args = [
                Arc::new(Int64Vector::from(in_data)) as _,
                Arc::new(Int64Vector::from_vec(vec![min])) as _,
                Arc::new(Int64Vector::from_vec(vec![max])) as _,
            ];
            let result = func
                .eval(FunctionContext::default(), args.as_slice())
                .unwrap();
            let expected: VectorRef = Arc::new(Int64Vector::from(expected));
            assert_eq!(expected, result);
        }
    }

    #[test]
    fn clamp_u64() {
        let inputs = [
            (
                vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
                1,
                3,
                vec![Some(1), Some(1), Some(2), Some(3), Some(3), Some(3)],
            ),
            (
                vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
                0,
                0,
                vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
            ),
            (
                vec![Some(0), None, Some(2), None, None, Some(5)],
                1,
                3,
                vec![Some(1), None, Some(2), None, None, Some(3)],
            ),
            (
                vec![None, None, None, None, None],
                0,
                1,
                vec![None, None, None, None, None],
            ),
        ];

        let func = ClampFunction;
        for (in_data, min, max, expected) in inputs {
            let args = [
                Arc::new(UInt64Vector::from(in_data)) as _,
                Arc::new(UInt64Vector::from_vec(vec![min])) as _,
                Arc::new(UInt64Vector::from_vec(vec![max])) as _,
            ];
            let result = func
                .eval(FunctionContext::default(), args.as_slice())
                .unwrap();
            let expected: VectorRef = Arc::new(UInt64Vector::from(expected));
            assert_eq!(expected, result);
        }
    }

    #[test]
    fn clamp_f64() {
        let inputs = [
            (
                vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
                -1.0,
                10.0,
                vec![Some(-1.0), Some(-1.0), Some(-1.0), Some(0.0), Some(1.0)],
            ),
            (
                vec![Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
                0.0,
                0.0,
                vec![Some(0.0), Some(0.0), Some(0.0), Some(0.0)],
            ),
            (
                vec![Some(-3.0), None, Some(-1.0), None, None, Some(2.0)],
                -2.0,
                1.0,
                vec![Some(-2.0), None, Some(-1.0), None, None, Some(1.0)],
            ),
            (
                vec![None, None, None, None, None],
                0.0,
                1.0,
                vec![None, None, None, None, None],
            ),
        ];

        let func = ClampFunction;
        for (in_data, min, max, expected) in inputs {
            let args = [
                Arc::new(Float64Vector::from(in_data)) as _,
                Arc::new(Float64Vector::from_vec(vec![min])) as _,
                Arc::new(Float64Vector::from_vec(vec![max])) as _,
            ];
            let result = func
                .eval(FunctionContext::default(), args.as_slice())
                .unwrap();
            let expected: VectorRef = Arc::new(Float64Vector::from(expected));
            assert_eq!(expected, result);
        }
    }

    #[test]
    fn clamp_const_i32() {
        let input = vec![Some(5)];
        let min = 2;
        let max = 4;

        let func = ClampFunction;
        let args = [
            Arc::new(ConstantVector::new(Arc::new(Int64Vector::from(input)), 1)) as _,
            Arc::new(Int64Vector::from_vec(vec![min])) as _,
            Arc::new(Int64Vector::from_vec(vec![max])) as _,
        ];
        let result = func
            .eval(FunctionContext::default(), args.as_slice())
            .unwrap();
        let expected: VectorRef = Arc::new(Int64Vector::from(vec![Some(4)]));
        assert_eq!(expected, result);
    }

    #[test]
    fn clamp_invalid_min_max() {
        let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
        let min = 10.0;
        let max = -1.0;

        let func = ClampFunction;
        let args = [
            Arc::new(Float64Vector::from(input)) as _,
            Arc::new(Float64Vector::from_vec(vec![min])) as _,
            Arc::new(Float64Vector::from_vec(vec![max])) as _,
        ];
        let result = func.eval(FunctionContext::default(), args.as_slice());
        assert!(result.is_err());
    }

    #[test]
    fn clamp_type_not_match() {
        let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
        let min = -1;
        let max = 10;

        let func = ClampFunction;
        let args = [
            Arc::new(Float64Vector::from(input)) as _,
            Arc::new(Int64Vector::from_vec(vec![min])) as _,
            Arc::new(UInt64Vector::from_vec(vec![max])) as _,
        ];
        let result = func.eval(FunctionContext::default(), args.as_slice());
        assert!(result.is_err());
    }

    #[test]
    fn clamp_min_is_not_scalar() {
        let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
        let min = -10.0;
        let max = 1.0;

        let func = ClampFunction;
        let args = [
            Arc::new(Float64Vector::from(input)) as _,
            Arc::new(Float64Vector::from_vec(vec![min, min])) as _,
            Arc::new(Float64Vector::from_vec(vec![max])) as _,
        ];
        let result = func.eval(FunctionContext::default(), args.as_slice());
        assert!(result.is_err());
    }

    #[test]
    fn clamp_no_max() {
        let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
        let min = -10.0;

        let func = ClampFunction;
        let args = [
            Arc::new(Float64Vector::from(input)) as _,
            Arc::new(Float64Vector::from_vec(vec![min])) as _,
        ];
        let result = func.eval(FunctionContext::default(), args.as_slice());
        assert!(result.is_err());
    }

    #[test]
    fn clamp_on_string() {
        let input = vec![Some("foo"), Some("foo"), Some("foo"), Some("foo")];

        let func = ClampFunction;
        let args = [
            Arc::new(StringVector::from(input)) as _,
            Arc::new(StringVector::from_vec(vec!["bar"])) as _,
            Arc::new(StringVector::from_vec(vec!["baz"])) as _,
        ];
        let result = func.eval(FunctionContext::default(), args.as_slice());
        assert!(result.is_err());
    }
}
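
The `CLAMP_MIN`/`CLAMP_MAX` const generics in `clamp_impl` let one kernel serve as clamp, clamp-to-min-only, or clamp-to-max-only, with the disabled branch resolved at compile time. The per-element rule, restated over plain slices as a runnable sketch:

```rust
// The per-element rule from clamp_impl, on plain slices for illustration.
fn clamp_all<const CLAMP_MIN: bool, const CLAMP_MAX: bool>(
    xs: &[i64],
    min: i64,
    max: i64,
) -> Vec<i64> {
    xs.iter()
        .map(|&x| {
            if CLAMP_MIN && x < min {
                min
            } else if CLAMP_MAX && x > max {
                max
            } else {
                x
            }
        })
        .collect()
}

fn main() {
    // Matches the clamp_i64 expectations above.
    assert_eq!(clamp_all::<true, true>(&[-3, 0, 2], -1, 1), vec![-1, 0, 1]);
    // With only the lower bound active, the kernel acts as a clamp-min.
    assert_eq!(clamp_all::<true, false>(&[-3, 0, 2], -1, 1), vec![-1, 0, 2]);
}
```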
@@ -14,9 +14,11 @@

 use std::sync::Arc;
 mod greatest;
+mod to_timezone;
 mod to_unixtime;

 use greatest::GreatestFunction;
+use to_timezone::ToTimezoneFunction;
 use to_unixtime::ToUnixtimeFunction;

 use crate::function_registry::FunctionRegistry;
@@ -25,6 +27,7 @@ pub(crate) struct TimestampFunction;

 impl TimestampFunction {
     pub fn register(registry: &FunctionRegistry) {
+        registry.register(Arc::new(ToTimezoneFunction));
         registry.register(Arc::new(ToUnixtimeFunction));
         registry.register(Arc::new(GreatestFunction));
     }
src/common/function/src/scalars/timestamp/to_timezone.rs (new file, 260 lines)

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fmt;
use std::sync::Arc;

use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use common_time::{Timestamp, Timezone};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::types::TimestampType;
use datatypes::value::Value;
use datatypes::vectors::{
    StringVector, TimestampMicrosecondVector, TimestampMillisecondVector,
    TimestampNanosecondVector, TimestampSecondVector, Vector,
};
use snafu::{ensure, OptionExt};

use crate::function::{Function, FunctionContext};
use crate::helper;

#[derive(Clone, Debug, Default)]
pub struct ToTimezoneFunction;

const NAME: &str = "to_timezone";

fn convert_to_timezone(arg: &str) -> Option<Timezone> {
    Timezone::from_tz_string(arg).ok()
}

fn convert_to_timestamp(arg: &Value) -> Option<Timestamp> {
    match arg {
        Value::Timestamp(ts) => Some(*ts),
        _ => None,
    }
}

impl fmt::Display for ToTimezoneFunction {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "TO_TIMEZONE")
    }
}

impl Function for ToTimezoneFunction {
    fn name(&self) -> &str {
        NAME
    }

    fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
        // type checked by signature - MUST BE timestamp
        Ok(input_types[0].clone())
    }

    fn signature(&self) -> Signature {
        helper::one_of_sigs2(
            vec![
                ConcreteDataType::timestamp_second_datatype(),
                ConcreteDataType::timestamp_millisecond_datatype(),
                ConcreteDataType::timestamp_microsecond_datatype(),
                ConcreteDataType::timestamp_nanosecond_datatype(),
            ],
            vec![ConcreteDataType::string_datatype()],
        )
    }

    fn eval(&self, _ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
        ensure!(
            columns.len() == 2,
            InvalidFuncArgsSnafu {
                err_msg: format!(
                    "The length of the args is not correct, expect exactly 2, have: {}",
                    columns.len()
                ),
            }
        );

        // TODO: maybe support epoch timestamp? https://github.com/GreptimeTeam/greptimedb/issues/3477
        let ts = columns[0].data_type().as_timestamp().with_context(|| {
            UnsupportedInputDataTypeSnafu {
                function: NAME,
                datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
            }
        })?;
        let array = columns[0].to_arrow_array();
        let times = match ts {
            TimestampType::Second(_) => {
                let vector = TimestampSecondVector::try_from_arrow_array(array).unwrap();
                (0..vector.len())
                    .map(|i| convert_to_timestamp(&vector.get(i)))
                    .collect::<Vec<_>>()
            }
            TimestampType::Millisecond(_) => {
                let vector = TimestampMillisecondVector::try_from_arrow_array(array).unwrap();
                (0..vector.len())
                    .map(|i| convert_to_timestamp(&vector.get(i)))
                    .collect::<Vec<_>>()
            }
            TimestampType::Microsecond(_) => {
                let vector = TimestampMicrosecondVector::try_from_arrow_array(array).unwrap();
                (0..vector.len())
                    .map(|i| convert_to_timestamp(&vector.get(i)))
                    .collect::<Vec<_>>()
            }
            TimestampType::Nanosecond(_) => {
                let vector = TimestampNanosecondVector::try_from_arrow_array(array).unwrap();
                (0..vector.len())
                    .map(|i| convert_to_timestamp(&vector.get(i)))
                    .collect::<Vec<_>>()
            }
        };

        let tzs = {
            let array = columns[1].to_arrow_array();
            let vector = StringVector::try_from_arrow_array(&array)
                .ok()
                .with_context(|| UnsupportedInputDataTypeSnafu {
                    function: NAME,
                    datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
                })?;
            (0..vector.len())
                .map(|i| convert_to_timezone(&vector.get(i).to_string()))
                .collect::<Vec<_>>()
        };

        let result = times
            .iter()
            .zip(tzs.iter())
            .map(|(time, tz)| match (time, tz) {
                (Some(time), _) => Some(time.to_timezone_aware_string(tz.as_ref())),
                _ => None,
            })
            .collect::<Vec<Option<String>>>();
        Ok(Arc::new(StringVector::from(result)))
    }
}

#[cfg(test)]
mod tests {

    use datatypes::scalars::ScalarVector;
    use datatypes::timestamp::{
        TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond,
    };
    use datatypes::vectors::StringVector;

    use super::*;

    #[test]
    fn test_timestamp_to_timezone() {
        let f = ToTimezoneFunction;
        assert_eq!("to_timezone", f.name());

        let results = vec![
            Some("1969-12-31 19:00:01"),
            None,
            Some("1970-01-01 03:00:01"),
            None,
        ];
        let times: Vec<Option<TimestampSecond>> = vec![
            Some(TimestampSecond::new(1)),
            None,
            Some(TimestampSecond::new(1)),
            None,
        ];
        let ts_vector: TimestampSecondVector =
            TimestampSecondVector::from_owned_iterator(times.into_iter());
        let tzs = vec![Some("America/New_York"), None, Some("Europe/Moscow"), None];
        let args: Vec<VectorRef> = vec![
            Arc::new(ts_vector),
            Arc::new(StringVector::from(tzs.clone())),
        ];
        let vector = f.eval(FunctionContext::default(), &args).unwrap();
        assert_eq!(4, vector.len());
        let expect_times: VectorRef = Arc::new(StringVector::from(results));
        assert_eq!(expect_times, vector);

        let results = vec![
            Some("1969-12-31 19:00:00.001"),
            None,
            Some("1970-01-01 03:00:00.001"),
            None,
        ];
        let times: Vec<Option<TimestampMillisecond>> = vec![
            Some(TimestampMillisecond::new(1)),
            None,
            Some(TimestampMillisecond::new(1)),
            None,
        ];
        let ts_vector: TimestampMillisecondVector =
            TimestampMillisecondVector::from_owned_iterator(times.into_iter());
        let args: Vec<VectorRef> = vec![
            Arc::new(ts_vector),
            Arc::new(StringVector::from(tzs.clone())),
        ];
        let vector = f.eval(FunctionContext::default(), &args).unwrap();
        assert_eq!(4, vector.len());
        let expect_times: VectorRef = Arc::new(StringVector::from(results));
        assert_eq!(expect_times, vector);

        let results = vec![
            Some("1969-12-31 19:00:00.000001"),
            None,
            Some("1970-01-01 03:00:00.000001"),
            None,
        ];
        let times: Vec<Option<TimestampMicrosecond>> = vec![
            Some(TimestampMicrosecond::new(1)),
            None,
            Some(TimestampMicrosecond::new(1)),
            None,
        ];
        let ts_vector: TimestampMicrosecondVector =
            TimestampMicrosecondVector::from_owned_iterator(times.into_iter());

        let args: Vec<VectorRef> = vec![
            Arc::new(ts_vector),
            Arc::new(StringVector::from(tzs.clone())),
        ];
        let vector = f.eval(FunctionContext::default(), &args).unwrap();
        assert_eq!(4, vector.len());
        let expect_times: VectorRef = Arc::new(StringVector::from(results));
        assert_eq!(expect_times, vector);

        let results = vec![
            Some("1969-12-31 19:00:00.000000001"),
            None,
            Some("1970-01-01 03:00:00.000000001"),
            None,
        ];
        let times: Vec<Option<TimestampNanosecond>> = vec![
            Some(TimestampNanosecond::new(1)),
            None,
            Some(TimestampNanosecond::new(1)),
            None,
        ];
        let ts_vector: TimestampNanosecondVector =
            TimestampNanosecondVector::from_owned_iterator(times.into_iter());

        let args: Vec<VectorRef> = vec![
            Arc::new(ts_vector),
            Arc::new(StringVector::from(tzs.clone())),
        ];
        let vector = f.eval(FunctionContext::default(), &args).unwrap();
        assert_eq!(4, vector.len());
        let expect_times: VectorRef = Arc::new(StringVector::from(results));
        assert_eq!(expect_times, vector);
    }
}
@@ -32,7 +32,7 @@ macro_rules! ok {
     };
 }
 
-/// Internal util macro to to create an error.
+/// Internal util macro to create an error.
 macro_rules! error {
     ($span:expr, $msg: expr) => {
         Err(syn::Error::new($span, $msg))
@@ -67,6 +67,14 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("Failed to execute {} txn operations via Etcd", max_operations))]
+    EtcdTxnFailed {
+        max_operations: usize,
+        #[snafu(source)]
+        error: etcd_client::Error,
+        location: Location,
+    },
+
     #[snafu(display("Failed to get sequence: {}", err_msg))]
     NextSequence { err_msg: String, location: Location },
 
@@ -400,6 +408,7 @@ impl ErrorExt for Error {
             IllegalServerState { .. }
             | EtcdTxnOpResponse { .. }
            | EtcdFailed { .. }
+            | EtcdTxnFailed { .. }
            | ConnectEtcd { .. } => StatusCode::Internal,
 
            SerdeJson { .. }
@@ -464,7 +464,7 @@ impl TableMetadataManager {
    pub fn max_logical_tables_per_batch(&self) -> usize {
        // The batch size is max_txn_size / 3 because the size of the `tables_data`
        // is 3 times the size of the `tables_data`.
-        self.kv_backend.max_txn_size() / 3
+        self.kv_backend.max_txn_ops() / 3
    }
 
    /// Creates metadata for multiple logical tables and return an error if different metadata exists.
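To make the batching arithmetic concrete: the surrounding comment suggests each logical table expands to roughly three transaction operations, so the batch size divides the backend's per-transaction operation cap by three. A minimal sketch of that computation, assuming etcd's default `--max-txn-ops` of 128 (the function below is a standalone illustration, not the crate's API):

    // Hypothetical illustration of the batch-size computation above.
    fn max_logical_tables_per_batch(max_txn_ops: usize) -> usize {
        // One logical table contributes ~3 txn operations, so divide the cap by 3.
        max_txn_ops / 3
    }

    fn main() {
        // With etcd's default `--max-txn-ops` of 128, a batch holds 42 tables.
        assert_eq!(42, max_logical_tables_per_batch(128));
    }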
@@ -860,6 +860,7 @@ mod tests {
    use bytes::Bytes;
    use common_time::util::current_time_millis;
    use futures::TryStreamExt;
+    use store_api::storage::RegionId;
    use table::metadata::{RawTableInfo, TableInfo};
 
    use super::datanode_table::DatanodeTableKey;
@@ -1056,6 +1057,36 @@ mod tests {
        );
    }
 
+    #[tokio::test]
+    async fn test_create_many_logical_tables_metadata() {
+        let kv_backend = Arc::new(MemoryKvBackend::default());
+        let table_metadata_manager = TableMetadataManager::new(kv_backend);
+
+        let mut tables_data = vec![];
+        for i in 0..128 {
+            let table_id = i + 1;
+            let region_number = table_id * 3;
+            let region_id = RegionId::new(table_id, region_number);
+            let region_route = new_region_route(region_id.as_u64(), 2);
+            let region_routes = vec![region_route.clone()];
+            let table_info: RawTableInfo = test_utils::new_test_table_info_with_name(
+                table_id,
+                &format!("my_table_{}", table_id),
+                region_routes.iter().map(|r| r.region.id.region_number()),
+            )
+            .into();
+            let table_route_value = TableRouteValue::physical(region_routes.clone());
+
+            tables_data.push((table_info, table_route_value));
+        }
+
+        // creates metadata.
+        table_metadata_manager
+            .create_logical_tables_metadata(tables_data)
+            .await
+            .unwrap();
+    }
+
    #[tokio::test]
    async fn test_delete_table_metadata() {
        let mem_kv = Arc::new(MemoryKvBackend::default());
@@ -19,8 +19,9 @@ use datatypes::schema::{ColumnSchema, SchemaBuilder};
 use store_api::storage::TableId;
 use table::metadata::{TableInfo, TableInfoBuilder, TableMetaBuilder};
 
-pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
+pub fn new_test_table_info_with_name<I: IntoIterator<Item = u32>>(
    table_id: TableId,
+    table_name: &str,
    region_numbers: I,
 ) -> TableInfo {
    let column_schemas = vec![
@@ -50,8 +51,14 @@ pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
    TableInfoBuilder::default()
        .table_id(table_id)
        .table_version(5)
-        .name("mytable")
+        .name(table_name)
        .meta(meta)
        .build()
        .unwrap()
 }
+
+pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
+    table_id: TableId,
+    region_numbers: I,
+) -> TableInfo {
+    new_test_table_info_with_name(table_id, "mytable", region_numbers)
+}
@@ -45,6 +45,10 @@ impl TxnService for ChrootKvBackend {
        let txn_res = self.inner.txn(txn).await?;
        Ok(self.chroot_txn_response(txn_res))
    }
+
+    fn max_txn_ops(&self) -> usize {
+        self.inner.max_txn_ops()
+    }
 }
 
 #[async_trait::async_trait]
@@ -33,12 +33,6 @@ use crate::rpc::store::{
 };
 use crate::rpc::KeyValue;
 
-// Maximum number of operations permitted in a transaction.
-// The etcd default configuration's `--max-txn-ops` is 128.
-//
-// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
-const MAX_TXN_SIZE: usize = 128;
-
 fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
    let (key, value) = kv.into_key_value();
    KeyValue { key, value }
@@ -46,10 +40,15 @@ fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
 
 pub struct EtcdStore {
    client: Client,
+    // Maximum number of operations permitted in a transaction.
+    // The etcd default configuration's `--max-txn-ops` is 128.
+    //
+    // For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
+    max_txn_ops: usize,
 }
 
 impl EtcdStore {
-    pub async fn with_endpoints<E, S>(endpoints: S) -> Result<KvBackendRef>
+    pub async fn with_endpoints<E, S>(endpoints: S, max_txn_ops: usize) -> Result<KvBackendRef>
    where
        E: AsRef<str>,
        S: AsRef<[E]>,
@@ -58,16 +57,19 @@ impl EtcdStore {
        .await
        .context(error::ConnectEtcdSnafu)?;
 
-        Ok(Self::with_etcd_client(client))
+        Ok(Self::with_etcd_client(client, max_txn_ops))
    }
 
-    pub fn with_etcd_client(client: Client) -> KvBackendRef {
-        Arc::new(Self { client })
+    pub fn with_etcd_client(client: Client, max_txn_ops: usize) -> KvBackendRef {
+        Arc::new(Self {
+            client,
+            max_txn_ops,
+        })
    }
 
    async fn do_multi_txn(&self, txn_ops: Vec<TxnOp>) -> Result<Vec<TxnResponse>> {
-        let max_txn_size = self.max_txn_size();
-        if txn_ops.len() < max_txn_size {
+        let max_txn_ops = self.max_txn_ops();
+        if txn_ops.len() < max_txn_ops {
            // fast path
            let _timer = METRIC_META_TXN_REQUEST
                .with_label_values(&["etcd", "txn"])
@@ -83,7 +85,7 @@ impl EtcdStore {
        }
 
        let txns = txn_ops
-            .chunks(max_txn_size)
+            .chunks(max_txn_ops)
            .map(|part| async move {
                let _timer = METRIC_META_TXN_REQUEST
                    .with_label_values(&["etcd", "txn"])
@@ -311,18 +313,20 @@ impl TxnService for EtcdStore {
            .with_label_values(&["etcd", "txn"])
            .start_timer();
 
+        let max_operations = txn.max_operations();
+
        let etcd_txn: Txn = txn.into();
        let txn_res = self
            .client
            .kv_client()
            .txn(etcd_txn)
            .await
-            .context(error::EtcdFailedSnafu)?;
+            .context(error::EtcdTxnFailedSnafu { max_operations })?;
        txn_res.try_into()
    }
 
-    fn max_txn_size(&self) -> usize {
-        MAX_TXN_SIZE
+    fn max_txn_ops(&self) -> usize {
+        self.max_txn_ops
    }
 }
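The chunked path in `do_multi_txn` above generalizes to any backend with a per-transaction operation cap: split the flat operation list into cap-sized chunks and run one transaction per chunk. A standalone sketch of that shape (the names here are illustrative, not the crate's API):

    // Illustrative only: chunk a flat list of ops by a per-txn operation cap.
    fn split_into_txns<T: Clone>(txn_ops: &[T], max_txn_ops: usize) -> Vec<Vec<T>> {
        txn_ops
            .chunks(max_txn_ops.max(1)) // guard against a zero cap
            .map(|part| part.to_vec())
            .collect()
    }

    fn main() {
        let ops: Vec<u32> = (0..300).collect();
        let txns = split_into_txns(&ops, 128); // etcd's default --max-txn-ops
        let sizes: Vec<usize> = txns.iter().map(|t| t.len()).collect();
        assert_eq!(vec![128, 128, 44], sizes);
    }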
@@ -323,6 +323,10 @@ impl<T: ErrorExt + Send + Sync> TxnService for MemoryKvBackend<T> {
            responses,
        })
    }
+
+    fn max_txn_ops(&self) -> usize {
+        usize::MAX
+    }
 }
 
 impl<T: ErrorExt + Send + Sync + 'static> ResettableKvBackend for MemoryKvBackend<T> {
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::cmp::max;
+
 use common_error::ext::ErrorExt;
 
 use crate::rpc::store::{DeleteRangeResponse, PutResponse, RangeResponse};
@@ -27,8 +29,8 @@ pub trait TxnService: Sync + Send {
    }
 
    /// Maximum number of operations permitted in a transaction.
-    fn max_txn_size(&self) -> usize {
-        usize::MAX
+    fn max_txn_ops(&self) -> usize {
+        unimplemented!("txn is not implemented")
    }
 }
 
@@ -192,6 +194,12 @@ impl Txn {
        self.req.failure = operations.into();
        self
    }
+
+    #[inline]
+    pub fn max_operations(&self) -> usize {
+        let opc = max(self.req.compare.len(), self.req.success.len());
+        max(opc, self.req.failure.len())
+    }
 }
 
 impl From<Txn> for TxnRequest {
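`max_operations` reports the largest branch of the transaction rather than the total, which matches how etcd enforces `--max-txn-ops`: the compare list, the success op list, and the failure op list are each checked against the cap individually (hedged: this is the behavior documented for etcd v3; consult the server's txn validation if in doubt). A small standalone illustration of that rule:

    // Illustrative: the op count etcd's --max-txn-ops checks is the
    // largest of the three branches, not their sum.
    fn max_operations(compare: usize, success: usize, failure: usize) -> usize {
        compare.max(success).max(failure)
    }

    fn main() {
        // 2 compares, 100 ops on success, 1 op on failure -> 100 is the binding count.
        assert_eq!(100, max_operations(2, 100, 1));
    }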
@@ -152,7 +152,7 @@ impl Runner {
            guard.key_guards.push(key_guard);
        }
 
-        // Execute the procedure. We need to release the lock whenever the the execution
+        // Execute the procedure. We need to release the lock whenever the execution
        // is successful or fail.
        self.execute_procedure_in_loop().await;
@@ -30,38 +30,87 @@ pub mod prelude;
 mod signature;
 use sqlparser_derive::{Visit, VisitMut};
 
-// sql output
-pub enum Output {
+/// new Output struct with output data(previously Output) and output meta
+#[derive(Debug)]
+pub struct Output {
+    pub data: OutputData,
+    pub meta: OutputMeta,
+}
+
+/// Original Output struct
+/// carrying result data to response/client/user interface
+pub enum OutputData {
    AffectedRows(usize),
    RecordBatches(RecordBatches),
-    Stream(SendableRecordBatchStream, Option<Arc<dyn PhysicalPlan>>),
+    Stream(SendableRecordBatchStream),
+}
+
+/// OutputMeta stores meta information produced/generated during the execution
+#[derive(Debug, Default)]
+pub struct OutputMeta {
+    /// May exist for query output. One can retrieve execution metrics from this plan.
+    pub plan: Option<Arc<dyn PhysicalPlan>>,
+    pub cost: usize,
 }
 
 impl Output {
-    // helper function to build original `Output::Stream`
-    pub fn new_stream(stream: SendableRecordBatchStream) -> Self {
-        Output::Stream(stream, None)
+    pub fn new_with_affected_rows(affected_rows: usize) -> Self {
+        Self {
+            data: OutputData::AffectedRows(affected_rows),
+            meta: Default::default(),
+        }
+    }
+
+    pub fn new_with_record_batches(recordbatches: RecordBatches) -> Self {
+        Self {
+            data: OutputData::RecordBatches(recordbatches),
+            meta: Default::default(),
+        }
+    }
+
+    pub fn new_with_stream(stream: SendableRecordBatchStream) -> Self {
+        Self {
+            data: OutputData::Stream(stream),
+            meta: Default::default(),
+        }
+    }
+
+    pub fn new(data: OutputData, meta: OutputMeta) -> Self {
+        Self { data, meta }
    }
 }
 
-impl Debug for Output {
+impl Debug for OutputData {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
-            Output::AffectedRows(rows) => write!(f, "Output::AffectedRows({rows})"),
-            Output::RecordBatches(recordbatches) => {
-                write!(f, "Output::RecordBatches({recordbatches:?})")
+            OutputData::AffectedRows(rows) => write!(f, "OutputData::AffectedRows({rows})"),
+            OutputData::RecordBatches(recordbatches) => {
+                write!(f, "OutputData::RecordBatches({recordbatches:?})")
            }
-            Output::Stream(_, df) => {
-                if df.is_some() {
-                    write!(f, "Output::Stream(<stream>, Some<physical_plan>)")
-                } else {
-                    write!(f, "Output::Stream(<stream>)")
-                }
+            OutputData::Stream(_) => {
+                write!(f, "OutputData::Stream(<stream>)")
            }
        }
    }
 }
+
+impl OutputMeta {
+    pub fn new(plan: Option<Arc<dyn PhysicalPlan>>, cost: usize) -> Self {
+        Self { plan, cost }
+    }
+
+    pub fn new_with_plan(plan: Arc<dyn PhysicalPlan>) -> Self {
+        Self {
+            plan: Some(plan),
+            cost: 0,
+        }
+    }
+
+    pub fn new_with_cost(cost: usize) -> Self {
+        Self { plan: None, cost }
+    }
+}
 
 pub use datafusion::physical_plan::ExecutionPlan as DfPhysicalPlan;
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
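For callers, the practical effect of the split is that pattern matches move from `Output` itself to its `data` field while execution metadata travels alongside. A minimal sketch of the intended call-site shape, using standalone mock types rather than the crate's actual definitions:

    // Standalone mock of the Output/OutputData/OutputMeta split; the real
    // types live in common-query and carry record batches, streams, and plans.
    #[derive(Debug, Default)]
    struct OutputMeta {
        cost: usize,
    }

    #[derive(Debug)]
    enum OutputData {
        AffectedRows(usize),
    }

    #[derive(Debug)]
    struct Output {
        data: OutputData,
        meta: OutputMeta,
    }

    fn main() {
        let output = Output {
            data: OutputData::AffectedRows(3),
            meta: OutputMeta::default(),
        };
        // Callers now match on `output.data` instead of on `Output` directly.
        match output.data {
            OutputData::AffectedRows(n) => {
                println!("{n} rows affected, cost {}", output.meta.cost)
            }
        }
    }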
@@ -32,7 +32,7 @@ use snafu::ResultExt;
 
 use crate::error::{self, Result};
 use crate::{
-    DfRecordBatch, DfSendableRecordBatchStream, RecordBatch, RecordBatchStream,
+    DfRecordBatch, DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream,
    SendableRecordBatchStream, Stream,
 };
 
@@ -228,6 +228,10 @@ impl RecordBatchStream for RecordBatchStreamAdapter {
            Metrics::Unavailable | Metrics::Unresolved(_) => None,
        }
    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
 }
 
 impl Stream for RecordBatchStreamAdapter {
@@ -316,6 +320,14 @@ impl RecordBatchStream for AsyncRecordBatchStreamAdapter {
    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        None
+    }
 }
 
 impl Stream for AsyncRecordBatchStreamAdapter {
@@ -375,6 +387,14 @@ mod test {
    fn schema(&self) -> SchemaRef {
        unimplemented!()
    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        None
+    }
 }
 
 impl Stream for MaybeErrorRecordBatchStream {
@@ -39,13 +39,9 @@ use snafu::{ensure, ResultExt};
 pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
    fn schema(&self) -> SchemaRef;
 
-    fn output_ordering(&self) -> Option<&[OrderOption]> {
-        None
-    }
+    fn output_ordering(&self) -> Option<&[OrderOption]>;
 
-    fn metrics(&self) -> Option<RecordBatchMetrics> {
-        None
-    }
+    fn metrics(&self) -> Option<RecordBatchMetrics>;
 }
 
 pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;
@@ -74,6 +70,14 @@ impl RecordBatchStream for EmptyRecordBatchStream {
    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        None
+    }
 }
 
 impl Stream for EmptyRecordBatchStream {
@@ -192,6 +196,14 @@ impl RecordBatchStream for SimpleRecordBatchStream {
    fn schema(&self) -> SchemaRef {
        self.inner.schema()
    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        None
+    }
 }
 
 impl Stream for SimpleRecordBatchStream {
@@ -41,7 +41,8 @@ mod tests {
    use futures::Stream;
 
    use super::*;
-    use crate::RecordBatchStream;
+    use crate::adapter::RecordBatchMetrics;
+    use crate::{OrderOption, RecordBatchStream};
 
    struct MockRecordBatchStream {
        batch: Option<RecordBatch>,
@@ -52,6 +53,14 @@ mod tests {
    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        None
+    }
 }
 
 impl Stream for MockRecordBatchStream {
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 use client::Database;
-use common_query::Output;
+use common_query::OutputData;
 use common_recordbatch::util;
 
 pub enum ExpectedOutput<'a> {
@@ -23,22 +23,24 @@ pub enum ExpectedOutput<'a> {
 
 pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) {
    let output = db.sql(sql).await.unwrap();
+    let output = output.data;
+
    match (&output, expected) {
-        (Output::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
+        (OutputData::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
            assert_eq!(*x, y, "actual: \n{}", x)
        }
-        (Output::RecordBatches(_), ExpectedOutput::QueryResult(x))
-        | (Output::Stream(_, _), ExpectedOutput::QueryResult(x)) => {
+        (OutputData::RecordBatches(_), ExpectedOutput::QueryResult(x))
+        | (OutputData::Stream(_), ExpectedOutput::QueryResult(x)) => {
            check_output_stream(output, x).await
        }
        _ => panic!(),
    }
 }
 
-pub async fn check_output_stream(output: Output, expected: &str) {
+pub async fn check_output_stream(output: OutputData, expected: &str) {
    let recordbatches = match output {
-        Output::Stream(stream, _) => util::collect_batches(stream).await.unwrap(),
-        Output::RecordBatches(recordbatches) => recordbatches,
+        OutputData::Stream(stream) => util::collect_batches(stream).await.unwrap(),
+        OutputData::RecordBatches(recordbatches) => recordbatches,
        _ => unreachable!(),
    };
    let pretty_print = recordbatches.pretty_print().unwrap();
@@ -36,7 +36,7 @@ use crate::{error, Interval};
 /// - for [TimeUnit::Second]: [-262144-01-01 00:00:00, +262143-12-31 23:59:59]
 /// - for [TimeUnit::Millisecond]: [-262144-01-01 00:00:00.000, +262143-12-31 23:59:59.999]
 /// - for [TimeUnit::Microsecond]: [-262144-01-01 00:00:00.000000, +262143-12-31 23:59:59.999999]
-/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145225, 2262-04-11 23:47:16.854775807]
+/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145224192, 2262-04-11 23:47:16.854775807]
 ///
 /// # Note:
 /// For values out of range, you can still store these timestamps, but while performing arithmetic
@@ -187,28 +187,28 @@ impl Timestamp {
        Self { unit, value }
    }
 
-    pub fn new_second(value: i64) -> Self {
+    pub const fn new_second(value: i64) -> Self {
        Self {
            value,
            unit: TimeUnit::Second,
        }
    }
 
-    pub fn new_millisecond(value: i64) -> Self {
+    pub const fn new_millisecond(value: i64) -> Self {
        Self {
            value,
            unit: TimeUnit::Millisecond,
        }
    }
 
-    pub fn new_microsecond(value: i64) -> Self {
+    pub const fn new_microsecond(value: i64) -> Self {
        Self {
            value,
            unit: TimeUnit::Microsecond,
        }
    }
 
-    pub fn new_nanosecond(value: i64) -> Self {
+    pub const fn new_nanosecond(value: i64) -> Self {
        Self {
            value,
            unit: TimeUnit::Nanosecond,
@@ -281,8 +281,26 @@ impl Timestamp {
            .and_then(|v| v.checked_add(micros as i64))
            .map(Timestamp::new_microsecond)
        } else {
+            // Refer to <https://github.com/chronotope/chrono/issues/1289>
+            //
+            // subsec nanos are always non-negative, however the timestamp itself (both in seconds and in nanos) can be
+            // negative. Now i64::MIN is NOT dividable by 1_000_000_000, so
+            //
+            // (sec * 1_000_000_000) + nsec
+            //
+            // may underflow (even when in theory we COULD represent the datetime as i64) because we add the non-negative
+            // nanos AFTER the multiplication. This is fixed by converting the negative case to
+            //
+            // ((sec + 1) * 1_000_000_000) + (nsec - 1_000_000_000)
+            let mut sec = sec;
+            let mut nsec = nsec as i64;
+            if sec < 0 && nsec > 0 {
+                nsec -= 1_000_000_000;
+                sec += 1;
+            }
+
            sec.checked_mul(1_000_000_000)
-                .and_then(|v| v.checked_add(nsec as i64))
+                .and_then(|v| v.checked_add(nsec))
                .map(Timestamp::new_nanosecond)
        }
    }
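To see why the rebalancing matters, take `i64::MIN = -9_223_372_036_854_775_808` nanoseconds. Split into whole seconds and non-negative subsecond nanos that is `sec = -9_223_372_037`, `nsec = 145_224_192` (the `.145224192` in the doc bound above). Multiplying first gives `-9_223_372_037_000_000_000`, already below `i64::MIN`, so `checked_mul` reports overflow even though the final sum fits. A compact standalone check of that arithmetic, mirroring the shape of the fix:

    // Worked check of the negative-case rebalancing described above.
    fn combine(mut sec: i64, mut nsec: i64) -> Option<i64> {
        // Rebalance so the multiplication cannot underflow before the add.
        if sec < 0 && nsec > 0 {
            nsec -= 1_000_000_000;
            sec += 1;
        }
        sec.checked_mul(1_000_000_000)?.checked_add(nsec)
    }

    fn main() {
        // i64::MIN split into whole seconds and non-negative subsec nanos.
        let (sec, nsec) = (-9_223_372_037_i64, 145_224_192_i64);
        // Naive order underflows: sec * 1e9 is already below i64::MIN...
        assert!(sec.checked_mul(1_000_000_000).is_none());
        // ...while the rebalanced form recovers exactly i64::MIN.
        assert_eq!(Some(i64::MIN), combine(sec, nsec));
    }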
@@ -425,6 +443,20 @@ impl Timestamp {
    }
 }
 
+impl Timestamp {
+    pub const MIN_SECOND: Self = Self::new_second(-8_334_601_228_800);
+    pub const MAX_SECOND: Self = Self::new_second(8_210_266_876_799);
+
+    pub const MIN_MILLISECOND: Self = Self::new_millisecond(-8_334_601_228_800_000);
+    pub const MAX_MILLISECOND: Self = Self::new_millisecond(8_210_266_876_799_999);
+
+    pub const MIN_MICROSECOND: Self = Self::new_microsecond(-8_334_601_228_800_000_000);
+    pub const MAX_MICROSECOND: Self = Self::new_microsecond(8_210_266_876_799_999_999);
+
+    pub const MIN_NANOSECOND: Self = Self::new_nanosecond(i64::MIN);
+    pub const MAX_NANOSECOND: Self = Self::new_nanosecond(i64::MAX);
+}
+
 /// Converts the naive datetime (which has no specific timezone) to a
 /// nanosecond epoch timestamp in UTC.
 fn naive_datetime_to_timestamp(
@@ -586,6 +618,7 @@ impl Hash for Timestamp {
 mod tests {
    use std::collections::hash_map::DefaultHasher;
 
+    use chrono_tz::Tz;
    use rand::Rng;
    use serde_json::Value;
 
@@ -1297,7 +1330,7 @@ mod tests {
            "+262142-12-31 23:59:59Z",
            "+262142-12-31 23:59:59.999Z",
            "+262142-12-31 23:59:59.999999Z",
-            "1677-09-21 00:12:43.145225Z",
+            "1677-09-21 00:12:43.145224192Z",
            "2262-04-11 23:47:16.854775807Z",
            "+100000-01-01 00:00:01.5Z",
        ];
@@ -1306,4 +1339,47 @@ mod tests {
            Timestamp::from_str_utc(s).unwrap();
        }
    }
+
+    #[test]
+    fn test_min_nanos_roundtrip() {
+        let (sec, nsec) = Timestamp::MIN_NANOSECOND.split();
+        let ts = Timestamp::from_splits(sec, nsec).unwrap();
+        assert_eq!(Timestamp::MIN_NANOSECOND, ts);
+    }
+
+    #[test]
+    fn test_timestamp_bound_format() {
+        assert_eq!(
+            "1677-09-21 00:12:43.145224192",
+            Timestamp::MIN_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+        assert_eq!(
+            "2262-04-11 23:47:16.854775807",
+            Timestamp::MAX_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+        assert_eq!(
+            "-262143-01-01 00:00:00",
+            Timestamp::MIN_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+        assert_eq!(
+            "+262142-12-31 23:59:59.999999",
+            Timestamp::MAX_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+        assert_eq!(
+            "-262143-01-01 00:00:00",
+            Timestamp::MIN_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+        assert_eq!(
+            "+262142-12-31 23:59:59.999",
+            Timestamp::MAX_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+        assert_eq!(
+            "-262143-01-01 00:00:00",
+            Timestamp::MIN_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+        assert_eq!(
+            "+262142-12-31 23:59:59",
+            Timestamp::MAX_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
+        );
+    }
+}
@@ -27,7 +27,7 @@ use common_error::ext::BoxedError;
 use common_error::status_code::StatusCode;
 use common_query::logical_plan::Expr;
 use common_query::physical_plan::DfPhysicalPlanAdapter;
-use common_query::{DfPhysicalPlan, Output};
+use common_query::{DfPhysicalPlan, OutputData};
 use common_recordbatch::SendableRecordBatchStream;
 use common_runtime::Runtime;
 use common_telemetry::tracing::{self, info_span};
@@ -651,11 +651,11 @@ impl RegionServerInner {
            .await
            .context(ExecuteLogicalPlanSnafu)?;
 
-        match result {
-            Output::AffectedRows(_) | Output::RecordBatches(_) => {
+        match result.data {
+            OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => {
                UnsupportedOutputSnafu { expected: "stream" }.fail()
            }
-            Output::Stream(stream, _) => Ok(stream),
+            OutputData::Stream(stream) => Ok(stream),
        }
    }
 
@@ -370,6 +370,36 @@ impl Value {
        }
    }
 }
 
+pub trait TryAsPrimitive<T: LogicalPrimitiveType> {
+    fn try_as_primitive(&self) -> Option<T::Native>;
+}
+
+macro_rules! impl_try_as_primitive {
+    ($Type: ident, $Variant: ident) => {
+        impl TryAsPrimitive<crate::types::$Type> for Value {
+            fn try_as_primitive(
+                &self,
+            ) -> Option<<crate::types::$Type as crate::types::LogicalPrimitiveType>::Native> {
+                match self {
+                    Value::$Variant(v) => Some((*v).into()),
+                    _ => None,
+                }
+            }
+        }
+    };
+}
+
+impl_try_as_primitive!(Int8Type, Int8);
+impl_try_as_primitive!(Int16Type, Int16);
+impl_try_as_primitive!(Int32Type, Int32);
+impl_try_as_primitive!(Int64Type, Int64);
+impl_try_as_primitive!(UInt8Type, UInt8);
+impl_try_as_primitive!(UInt16Type, UInt16);
+impl_try_as_primitive!(UInt32Type, UInt32);
+impl_try_as_primitive!(UInt64Type, UInt64);
+impl_try_as_primitive!(Float32Type, Float32);
+impl_try_as_primitive!(Float64Type, Float64);
+
 pub fn to_null_scalar_value(output_type: &ConcreteDataType) -> Result<ScalarValue> {
    Ok(match output_type {
        ConcreteDataType::Null(_) => ScalarValue::Null,
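The macro above stamps out one `TryAsPrimitive` impl per primitive variant, so extracting a native value becomes a typed, checked call instead of a manual `match` at every call site. A self-contained sketch of the pattern with mock types (the real trait is generic over the crate's `LogicalPrimitiveType`; here one impl is written by hand where the crate uses the macro):

    // Mock of the TryAsPrimitive pattern: one impl per enum variant.
    #[derive(Debug)]
    enum Value {
        Int32(i32),
        Float64(f64),
    }

    trait TryAsPrimitive<T> {
        fn try_as_primitive(&self) -> Option<T>;
    }

    impl TryAsPrimitive<i32> for Value {
        fn try_as_primitive(&self) -> Option<i32> {
            match self {
                Value::Int32(v) => Some(*v),
                _ => None,
            }
        }
    }

    fn main() {
        let v = Value::Int32(7);
        let got: Option<i32> = v.try_as_primitive();
        assert_eq!(Some(7), got);
        // A mismatched variant yields None instead of panicking.
        let f = Value::Float64(1.5);
        assert_eq!(None, TryAsPrimitive::<i32>::try_as_primitive(&f));
    }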
@@ -2387,4 +2417,12 @@ mod tests {
        );
        check_value_ref_size_eq(&ValueRef::Decimal128(Decimal128::new(1234, 3, 1)), 32)
    }
+
+    #[test]
+    fn test_incorrect_default_value_issue_3479() {
+        let value = OrderedF64::from(0.047318541668048164);
+        let serialized = serde_json::to_string(&value).unwrap();
+        let deserialized: OrderedF64 = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(value, deserialized);
+    }
 }
@@ -22,8 +22,9 @@ use std::task::{Context, Poll};
 use common_datasource::object_store::build_backend;
 use common_error::ext::BoxedError;
 use common_query::prelude::Expr;
+use common_recordbatch::adapter::RecordBatchMetrics;
 use common_recordbatch::error::{CastVectorSnafu, ExternalSnafu, Result as RecordBatchResult};
-use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
+use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
 use datafusion::logical_expr::utils as df_logical_expr_utils;
 use datatypes::prelude::ConcreteDataType;
 use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
@@ -151,6 +152,14 @@ impl RecordBatchStream for FileToScanRegionStream {
    fn schema(&self) -> SchemaRef {
        self.scan_schema.clone()
    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        None
+    }
 }
 
 impl Stream for FileToScanRegionStream {
@@ -18,6 +18,7 @@ common-query.workspace = true
 common-telemetry.workspace = true
 common-time.workspace = true
 datatypes.workspace = true
+enum_dispatch = "0.3"
 hydroflow = "0.5.0"
 itertools.workspace = true
 num-traits = "0.2"
@@ -27,3 +28,6 @@ session.workspace = true
 snafu.workspace = true
 tokio.workspace = true
 tonic.workspace = true
+
+[dev-dependencies]
+serde_json = "1.0"
@@ -24,5 +24,6 @@ mod scalar;
 pub(crate) use error::{EvalError, InvalidArgumentSnafu, OptimizeSnafu};
 pub(crate) use func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
 pub(crate) use id::{GlobalId, Id, LocalId};
+pub(crate) use linear::{MapFilterProject, MfpPlan, SafeMfpPlan};
 pub(crate) use relation::{AggregateExpr, AggregateFunc};
 pub(crate) use scalar::ScalarExpr;
@@ -61,4 +61,7 @@ pub enum EvalError {
 
    #[snafu(display("Unsupported temporal filter: {reason}"))]
    UnsupportedTemporalFilter { reason: String, location: Location },
+
+    #[snafu(display("Overflowed during evaluation"))]
+    Overflow { location: Location },
 }
@@ -45,7 +45,7 @@ use crate::repr::{self, value_to_internal_ts, Diff, Row};
 /// expressions in `self.expressions`, even though this is not something
 /// we can directly evaluate. The plan creation methods will defensively
 /// ensure that the right thing happens.
-#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
 pub struct MapFilterProject {
    /// A sequence of expressions that should be appended to the row.
    ///
@@ -415,7 +415,7 @@ impl MapFilterProject {
 }
 
 /// A wrapper type which indicates it is safe to simply evaluate all expressions.
-#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
 pub struct SafeMfpPlan {
    pub(crate) mfp: MapFilterProject,
 }
@@ -800,7 +800,7 @@ mod test {
        .unwrap();
        // only retain sum result
        let mfp = mfp.project(vec![4]).unwrap();
-        // accept only if if the sum is greater than 10
+        // accept only if the sum is greater than 10
        let mfp = mfp
            .filter(vec![ScalarExpr::Column(0).call_binary(
                ScalarExpr::Literal(Value::from(10i32), ConcreteDataType::int32_datatype()),
@@ -21,7 +21,7 @@ mod accum;
 mod func;
 
 /// Describes an aggregation expression.
-#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
 pub struct AggregateExpr {
    /// Names the aggregation function.
    pub func: AggregateFunc,
@@ -14,7 +14,10 @@
 
 //! Accumulators for aggregate functions that's is accumulatable. i.e. sum/count
 //!
-//! Currently support sum, count, any, all
+//! Accumulator will only be restore from row and being updated every time dataflow need process a new batch of rows.
+//! So the overhead is acceptable.
+//!
+//! Currently support sum, count, any, all and min/max(with one caveat that min/max can't support delete with aggregate).
 
 use std::fmt::Display;
 
@@ -22,13 +25,506 @@ use common_decimal::Decimal128;
 use common_time::{Date, DateTime};
 use datatypes::data_type::ConcreteDataType;
 use datatypes::value::{OrderedF32, OrderedF64, OrderedFloat, Value};
+use enum_dispatch::enum_dispatch;
 use hydroflow::futures::stream::Concat;
 use serde::{Deserialize, Serialize};
+use snafu::ensure;
 
-use crate::expr::error::{InternalSnafu, TryFromValueSnafu, TypeMismatchSnafu};
+use crate::expr::error::{InternalSnafu, OverflowSnafu, TryFromValueSnafu, TypeMismatchSnafu};
+use crate::expr::relation::func::GenericFn;
 use crate::expr::{AggregateFunc, EvalError};
 use crate::repr::Diff;
+
+/// Accumulates values for the various types of accumulable aggregations.
+#[enum_dispatch]
+pub trait Accumulator: Sized {
+    fn into_state(self) -> Vec<Value>;
+
+    fn update(
+        &mut self,
+        aggr_fn: &AggregateFunc,
+        value: Value,
+        diff: Diff,
+    ) -> Result<(), EvalError>;
+
+    fn update_batch<I>(&mut self, aggr_fn: &AggregateFunc, value_diffs: I) -> Result<(), EvalError>
+    where
+        I: IntoIterator<Item = (Value, Diff)>,
+    {
+        for (v, d) in value_diffs {
+            self.update(aggr_fn, v, d)?;
+        }
+        Ok(())
+    }
+
+    fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError>;
+}
+
+/// Bool accumulator, used for `Any` `All` `Max/MinBool`
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct Bool {
+    /// The number of `true` values observed.
+    trues: Diff,
+    /// The number of `false` values observed.
+    falses: Diff,
+}
+
+impl TryFrom<Vec<Value>> for Bool {
+    type Error = EvalError;
+
+    fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
+        ensure!(
+            state.len() == 2,
+            InternalSnafu {
+                reason: "Bool Accumulator state should have 2 values",
+            }
+        );
+
+        let mut iter = state.into_iter();
+
+        Ok(Self {
+            trues: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+            falses: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+        })
+    }
+}
+
+impl Accumulator for Bool {
+    fn into_state(self) -> Vec<Value> {
+        vec![self.trues.into(), self.falses.into()]
+    }
+
+    /// Null values are ignored
+    fn update(
+        &mut self,
+        aggr_fn: &AggregateFunc,
+        value: Value,
+        diff: Diff,
+    ) -> Result<(), EvalError> {
+        ensure!(
+            matches!(
+                aggr_fn,
+                AggregateFunc::Any
+                    | AggregateFunc::All
+                    | AggregateFunc::MaxBool
+                    | AggregateFunc::MinBool
+            ),
+            InternalSnafu {
+                reason: format!(
+                    "Bool Accumulator does not support this aggregation function: {:?}",
+                    aggr_fn
+                ),
+            }
+        );
+
+        match value {
+            Value::Boolean(true) => self.trues += diff,
+            Value::Boolean(false) => self.falses += diff,
+            Value::Null => (), // ignore nulls
+            x => {
+                return Err(TypeMismatchSnafu {
+                    expected: ConcreteDataType::boolean_datatype(),
+                    actual: x.data_type(),
+                }
+                .build());
+            }
+        };
+        Ok(())
+    }
+
+    fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
+        match aggr_fn {
+            AggregateFunc::Any => Ok(Value::from(self.trues > 0)),
+            AggregateFunc::All => Ok(Value::from(self.falses == 0)),
+            AggregateFunc::MaxBool => Ok(Value::from(self.trues > 0)),
+            AggregateFunc::MinBool => Ok(Value::from(self.falses == 0)),
+            _ => Err(InternalSnafu {
+                reason: format!(
+                    "Bool Accumulator does not support this aggregation function: {:?}",
+                    aggr_fn
+                ),
+            }
+            .build()),
+        }
+    }
+}
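Tracking `trues`/`falses` as signed counts is what lets `Any`/`All` stay correct under retractions (`diff < 0`): `Any` holds while at least one `true` is outstanding, `All` while no `false` is. A standalone sketch of that bookkeeping, separate from the crate's types:

    // Illustrative count-based Any/All over insertions (+1) and retractions (-1).
    #[derive(Default)]
    struct BoolCounts {
        trues: i64,
        falses: i64,
    }

    impl BoolCounts {
        fn update(&mut self, value: bool, diff: i64) {
            if value {
                self.trues += diff;
            } else {
                self.falses += diff;
            }
        }
        fn any(&self) -> bool {
            self.trues > 0
        }
        fn all(&self) -> bool {
            self.falses == 0
        }
    }

    fn main() {
        let mut acc = BoolCounts::default();
        acc.update(true, 1);
        acc.update(false, 1);
        assert!(acc.any() && !acc.all());
        // Retract the lone `false`; `all` flips back to true.
        acc.update(false, -1);
        assert!(acc.any() && acc.all());
    }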
+/// Accumulates simple numeric values for sum over integer.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct SimpleNumber {
+    /// The accumulation of all non-NULL values observed.
+    accum: i128,
+    /// The number of non-NULL values observed.
+    non_nulls: Diff,
+}
+
+impl TryFrom<Vec<Value>> for SimpleNumber {
+    type Error = EvalError;
+
+    fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
+        ensure!(
+            state.len() == 2,
+            InternalSnafu {
+                reason: "Number Accumulator state should have 2 values",
+            }
+        );
+        let mut iter = state.into_iter();
+
+        Ok(Self {
+            accum: Decimal128::try_from(iter.next().unwrap())
+                .map_err(err_try_from_val)?
+                .val(),
+            non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+        })
+    }
+}
+
+impl Accumulator for SimpleNumber {
+    fn into_state(self) -> Vec<Value> {
+        vec![
+            Value::Decimal128(Decimal128::new(self.accum, 38, 0)),
+            self.non_nulls.into(),
+        ]
+    }
+
+    fn update(
+        &mut self,
+        aggr_fn: &AggregateFunc,
+        value: Value,
+        diff: Diff,
+    ) -> Result<(), EvalError> {
+        ensure!(
+            matches!(
+                aggr_fn,
+                AggregateFunc::SumInt16
+                    | AggregateFunc::SumInt32
+                    | AggregateFunc::SumInt64
+                    | AggregateFunc::SumUInt16
+                    | AggregateFunc::SumUInt32
+                    | AggregateFunc::SumUInt64
+            ),
+            InternalSnafu {
+                reason: format!(
+                    "SimpleNumber Accumulator does not support this aggregation function: {:?}",
+                    aggr_fn
+                ),
+            }
+        );
+
+        let v = match (aggr_fn, value) {
+            (AggregateFunc::SumInt16, Value::Int16(x)) => i128::from(x),
+            (AggregateFunc::SumInt32, Value::Int32(x)) => i128::from(x),
+            (AggregateFunc::SumInt64, Value::Int64(x)) => i128::from(x),
+            (AggregateFunc::SumUInt16, Value::UInt16(x)) => i128::from(x),
+            (AggregateFunc::SumUInt32, Value::UInt32(x)) => i128::from(x),
+            (AggregateFunc::SumUInt64, Value::UInt64(x)) => i128::from(x),
+            (_f, Value::Null) => return Ok(()), // ignore null
+            (f, v) => {
+                let expected_datatype = f.signature().input;
+                return Err(TypeMismatchSnafu {
+                    expected: expected_datatype,
+                    actual: v.data_type(),
+                }
+                .build())?;
+            }
+        };
+
+        self.accum += v * i128::from(diff);
+
+        self.non_nulls += diff;
+        Ok(())
+    }
+
+    fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
+        match aggr_fn {
+            AggregateFunc::SumInt16 | AggregateFunc::SumInt32 | AggregateFunc::SumInt64 => {
+                i64::try_from(self.accum)
+                    .map_err(|_e| OverflowSnafu {}.build())
+                    .map(Value::from)
+            }
+            AggregateFunc::SumUInt16 | AggregateFunc::SumUInt32 | AggregateFunc::SumUInt64 => {
+                u64::try_from(self.accum)
+                    .map_err(|_e| OverflowSnafu {}.build())
+                    .map(Value::from)
+            }
+            _ => Err(InternalSnafu {
+                reason: format!(
+                    "SimpleNumber Accumulator does not support this aggregation function: {:?}",
+                    aggr_fn
+                ),
+            }
+            .build()),
+        }
+    }
+}
+/// Accumulates float values for sum over floating numbers.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct Float {
+    /// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
+    /// accum will be set to zero if `non_nulls` is zero.
+    accum: OrderedF64,
+    /// Counts +inf
+    pos_infs: Diff,
+    /// Counts -inf
+    neg_infs: Diff,
+    /// Counts NaNs
+    nans: Diff,
+    /// Counts non-NULL values
+    non_nulls: Diff,
+}
+
+impl TryFrom<Vec<Value>> for Float {
+    type Error = EvalError;
+
+    fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
+        ensure!(
+            state.len() == 5,
+            InternalSnafu {
+                reason: "Float Accumulator state should have 5 values",
+            }
+        );
+
+        let mut iter = state.into_iter();
+
+        let mut ret = Self {
+            accum: OrderedF64::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+            pos_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+            neg_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+            nans: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+            non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+        };
+
+        // This prevent counter-intuitive behavior of summing over no values
+        if ret.non_nulls == 0 {
+            ret.accum = OrderedFloat::from(0.0);
+        }
+
+        Ok(ret)
+    }
+}
+
+impl Accumulator for Float {
+    fn into_state(self) -> Vec<Value> {
+        vec![
+            self.accum.into(),
+            self.pos_infs.into(),
+            self.neg_infs.into(),
+            self.nans.into(),
+            self.non_nulls.into(),
+        ]
+    }
+
+    /// sum ignore null
+    fn update(
+        &mut self,
+        aggr_fn: &AggregateFunc,
+        value: Value,
+        diff: Diff,
+    ) -> Result<(), EvalError> {
+        ensure!(
+            matches!(
+                aggr_fn,
+                AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64
+            ),
+            InternalSnafu {
+                reason: format!(
+                    "Float Accumulator does not support this aggregation function: {:?}",
+                    aggr_fn
+                ),
+            }
+        );
+
+        let x = match (aggr_fn, value) {
+            (AggregateFunc::SumFloat32, Value::Float32(x)) => OrderedF64::from(*x as f64),
+            (AggregateFunc::SumFloat64, Value::Float64(x)) => OrderedF64::from(x),
+            (_f, Value::Null) => return Ok(()), // ignore null
+            (f, v) => {
+                let expected_datatype = f.signature().input;
+                return Err(TypeMismatchSnafu {
+                    expected: expected_datatype,
+                    actual: v.data_type(),
+                }
+                .build())?;
+            }
+        };
+
+        if x.is_nan() {
+            self.nans += diff;
+        } else if x.is_infinite() {
+            if x.is_sign_positive() {
+                self.pos_infs += diff;
+            } else {
+                self.neg_infs += diff;
+            }
+        } else {
+            self.accum += *(x * OrderedF64::from(diff as f64));
+        }
+
+        self.non_nulls += diff;
+        Ok(())
+    }
+
+    fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
+        match aggr_fn {
+            AggregateFunc::SumFloat32 => Ok(Value::Float32(OrderedF32::from(self.accum.0 as f32))),
+            AggregateFunc::SumFloat64 => Ok(Value::Float64(self.accum)),
+            _ => Err(InternalSnafu {
+                reason: format!(
+                    "Float Accumulator does not support this aggregation function: {:?}",
+                    aggr_fn
+                ),
+            }
+            .build()),
+        }
+    }
+}
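Counting NaNs and infinities out of band instead of folding them into `accum` is what keeps the sum retractable: in IEEE-754 arithmetic `inf - inf` is `NaN`, so once a special value enters the running sum it can never be subtracted back out, while a count can simply return to zero. A small standalone illustration of the hazard and the count-based fix:

    // Why special float values are counted, not summed: IEEE-754 arithmetic
    // cannot undo adding an infinity.
    fn main() {
        // Naive retraction: add +inf, then subtract it again.
        let naive = (0.0_f64 + f64::INFINITY) - f64::INFINITY;
        assert!(naive.is_nan()); // inf - inf is NaN; the sum is poisoned

        // Count-based: track infinities out of band, retract by decrementing.
        let (mut accum, mut pos_infs) = (0.0_f64, 0_i64);
        pos_infs += 1; // "add" +inf
        pos_infs -= 1; // retract it
        // No +inf outstanding, so the finite accumulator is the answer.
        let result = if pos_infs > 0 { f64::INFINITY } else { accum };
        assert_eq!(0.0, result);
        accum += 1.0; // the accumulator keeps working normally
        assert_eq!(1.0, accum);
    }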
+/// Accumulates a single `Ord`ed `Value`, useful for min/max aggregations.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct OrdValue {
+    val: Option<Value>,
+    non_nulls: Diff,
+}
+
+impl TryFrom<Vec<Value>> for OrdValue {
+    type Error = EvalError;
+
+    fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
+        ensure!(
+            state.len() == 2,
+            InternalSnafu {
+                reason: "OrdValue Accumulator state should have 2 values",
+            }
+        );
+
+        let mut iter = state.into_iter();
+
+        Ok(Self {
+            val: {
+                let v = iter.next().unwrap();
+                if v == Value::Null {
+                    None
+                } else {
+                    Some(v)
+                }
+            },
+            non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
+        })
+    }
+}
+
+impl Accumulator for OrdValue {
+    fn into_state(self) -> Vec<Value> {
+        vec![self.val.unwrap_or(Value::Null), self.non_nulls.into()]
+    }
+
+    /// min/max try to find results in all non-null values, if all values are null, the result is null.
+    /// count(col_name) gives the number of non-null values, count(*) gives the number of rows including nulls.
+    /// TODO(discord9): add count(*) as a aggr function
+    fn update(
+        &mut self,
+        aggr_fn: &AggregateFunc,
+        value: Value,
+        diff: Diff,
+    ) -> Result<(), EvalError> {
+        ensure!(
+            aggr_fn.is_max() || aggr_fn.is_min() || matches!(aggr_fn, AggregateFunc::Count),
+            InternalSnafu {
+                reason: format!(
+                    "OrdValue Accumulator does not support this aggregation function: {:?}",
+                    aggr_fn
+                ),
+            }
+        );
+        if diff <= 0 && (aggr_fn.is_max() || aggr_fn.is_min()) {
+            return Err(InternalSnafu {
+                reason: "OrdValue Accumulator does not support non-monotonic input for min/max aggregation".to_string(),
+            }.build());
+        }
+
+        // if aggr_fn is count, the incoming value type doesn't matter in type checking
+        // otherwise, type need to be the same or value can be null
+        let check_type_aggr_fn_and_arg_value =
+            ty_eq_without_precision(value.data_type(), aggr_fn.signature().input)
+                || matches!(aggr_fn, AggregateFunc::Count)
+                || value.is_null();
+        let check_type_aggr_fn_and_self_val = self
+            .val
+            .as_ref()
+            .map(|zelf| ty_eq_without_precision(zelf.data_type(), aggr_fn.signature().input))
+            .unwrap_or(true)
+            || matches!(aggr_fn, AggregateFunc::Count);
+
+        if !check_type_aggr_fn_and_arg_value {
+            return Err(TypeMismatchSnafu {
+                expected: aggr_fn.signature().input,
+                actual: value.data_type(),
+            }
+            .build());
+        } else if !check_type_aggr_fn_and_self_val {
+            return Err(TypeMismatchSnafu {
+                expected: aggr_fn.signature().input,
+                actual: self
+                    .val
+                    .as_ref()
+                    .map(|v| v.data_type())
+                    .unwrap_or(ConcreteDataType::null_datatype()),
+            }
+            .build());
+        }
+
+        let is_null = value.is_null();
+        if is_null {
+            return Ok(());
+        }
+
+        if !is_null {
+            // compile count(*) to count(true) to include null/non-nulls
+            // And the counts of non-null values are updated here
+            self.non_nulls += diff;
+
+            match aggr_fn.signature().generic_fn {
+                GenericFn::Max => {
+                    self.val = self
+                        .val
+                        .clone()
+                        .map(|v| v.max(value.clone()))
+                        .or_else(|| Some(value))
+                }
+                GenericFn::Min => {
+                    self.val = self
+                        .val
+                        .clone()
+                        .map(|v| v.min(value.clone()))
+                        .or_else(|| Some(value))
+                }
+                GenericFn::Count => (),
+                _ => unreachable!("already checked by ensure!"),
+            }
+        };
+        // min/max ignore nulls
+
+        Ok(())
+    }
|
|
||||||
|
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
|
||||||
|
if aggr_fn.is_max() || aggr_fn.is_min() {
|
||||||
|
Ok(self.val.clone().unwrap_or(Value::Null))
|
||||||
|
} else if matches!(aggr_fn, AggregateFunc::Count) {
|
||||||
|
Ok(self.non_nulls.into())
|
||||||
|
} else {
|
||||||
|
Err(InternalSnafu {
|
||||||
|
reason: format!(
|
||||||
|
"OrdValue Accumulator does not support this aggregation function: {:?}",
|
||||||
|
aggr_fn
|
||||||
|
),
|
||||||
|
}
|
||||||
|
.build())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
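
The `diff` argument threaded through every `update` above is what makes these accumulators incremental: a positive diff stands for inserted rows and a negative diff for retracted ones, which is why `sum` can subtract on deletion while min/max (which cannot "un-see" a value) reject negative diffs. A minimal self-contained sketch of that idea, using plain `i64` in place of the crate's `Value`/`Diff` types (all names here are illustrative, not part of the codebase):

```rust
/// Toy diff-aware sum: `diff = +n` inserts n copies of `value`,
/// `diff = -n` retracts n copies, mirroring `update(value, diff)` above.
#[derive(Debug, Default)]
struct DiffSum {
    accum: i64,
    non_nulls: i64,
}

impl DiffSum {
    fn update(&mut self, value: Option<i64>, diff: i64) {
        let Some(v) = value else { return }; // nulls are ignored
        self.accum += v * diff;
        self.non_nulls += diff;
    }

    fn eval(&self) -> i64 {
        // Same convention as the Float accumulator: no observed rows sums to zero.
        if self.non_nulls == 0 {
            0
        } else {
            self.accum
        }
    }
}

fn main() {
    let mut sum = DiffSum::default();
    sum.update(Some(3), 1); // insert 3
    sum.update(Some(5), 2); // insert two copies of 5
    sum.update(Some(5), -1); // retract one of them
    sum.update(None, 1); // null, ignored
    assert_eq!(sum.eval(), 8);
}
```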

/// Accumulates values for the various types of accumulable aggregations.
///
/// We assume that there are not more than 2^32 elements for the aggregation.
@@ -38,34 +534,407 @@ use crate::repr::Diff;
/// The float accumulator performs accumulation with tolerance for floating point error.
///
/// TODO(discord9): check for overflowing
+#[enum_dispatch(Accumulator)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Accum {
    /// Accumulates boolean values.
-    Bool {
-        /// The number of `true` values observed.
-        trues: Diff,
-        /// The number of `false` values observed.
-        falses: Diff,
-    },
+    Bool(Bool),
    /// Accumulates simple numeric values.
-    SimpleNumber {
-        /// The accumulation of all non-NULL values observed.
-        accum: i128,
-        /// The number of non-NULL values observed.
-        non_nulls: Diff,
-    },
+    SimpleNumber(SimpleNumber),
    /// Accumulates float values.
-    Float {
-        /// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
-        /// accum will be set to zero if `non_nulls` is zero.
-        accum: OrderedF64,
-        /// Counts +inf
-        pos_infs: Diff,
-        /// Counts -inf
-        neg_infs: Diff,
-        /// Counts NaNs
-        nans: Diff,
-        /// Counts non-NULL values
-        non_nulls: Diff,
-    },
+    Float(Float),
+    /// Accumulates `Value`s that impl `Ord`
+    OrdValue(OrdValue),
}

impl Accum {
    pub fn new_accum(aggr_fn: &AggregateFunc) -> Result<Self, EvalError> {
        Ok(match aggr_fn {
            AggregateFunc::Any
            | AggregateFunc::All
            | AggregateFunc::MaxBool
            | AggregateFunc::MinBool => Self::from(Bool {
                trues: 0,
                falses: 0,
            }),
            AggregateFunc::SumInt16
            | AggregateFunc::SumInt32
            | AggregateFunc::SumInt64
            | AggregateFunc::SumUInt16
            | AggregateFunc::SumUInt32
            | AggregateFunc::SumUInt64 => Self::from(SimpleNumber {
                accum: 0,
                non_nulls: 0,
            }),
            AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => Self::from(Float {
                accum: OrderedF64::from(0.0),
                pos_infs: 0,
                neg_infs: 0,
                nans: 0,
                non_nulls: 0,
            }),
            f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
                Self::from(OrdValue {
                    val: None,
                    non_nulls: 0,
                })
            }
            f => {
                return Err(InternalSnafu {
                    reason: format!(
                        "Accumulator does not support this aggregation function: {:?}",
                        f
                    ),
                }
                .build());
            }
        })
    }

    pub fn try_into_accum(aggr_fn: &AggregateFunc, state: Vec<Value>) -> Result<Self, EvalError> {
        match aggr_fn {
            AggregateFunc::Any
            | AggregateFunc::All
            | AggregateFunc::MaxBool
            | AggregateFunc::MinBool => Ok(Self::from(Bool::try_from(state)?)),
            AggregateFunc::SumInt16
            | AggregateFunc::SumInt32
            | AggregateFunc::SumInt64
            | AggregateFunc::SumUInt16
            | AggregateFunc::SumUInt32
            | AggregateFunc::SumUInt64 => Ok(Self::from(SimpleNumber::try_from(state)?)),
            AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => {
                Ok(Self::from(Float::try_from(state)?))
            }
            f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
                Ok(Self::from(OrdValue::try_from(state)?))
            }
            f => Err(InternalSnafu {
                reason: format!(
                    "Accumulator does not support this aggregation function: {:?}",
                    f
                ),
            }
            .build()),
        }
    }
}

fn err_try_from_val<T: Display>(reason: T) -> EvalError {
    TryFromValueSnafu {
        msg: reason.to_string(),
    }
    .build()
}

/// Compare types while ignoring their precision, including `Timestamp`, `Time`,
/// `Duration` and `Interval`.
fn ty_eq_without_precision(left: ConcreteDataType, right: ConcreteDataType) -> bool {
    left == right
        || matches!(left, ConcreteDataType::Timestamp(..))
            && matches!(right, ConcreteDataType::Timestamp(..))
        || matches!(left, ConcreteDataType::Time(..)) && matches!(right, ConcreteDataType::Time(..))
        || matches!(left, ConcreteDataType::Duration(..))
            && matches!(right, ConcreteDataType::Duration(..))
        || matches!(left, ConcreteDataType::Interval(..))
            && matches!(right, ConcreteDataType::Interval(..))
}

#[cfg(test)]
mod test {
    use super::*;
    #[test]
    fn test_accum() {
        let testcases = vec![
            (
                AggregateFunc::SumInt32,
                vec![(Value::Int32(1), 1), (Value::Null, 1)],
                (
                    Value::Int64(1),
                    vec![Value::Decimal128(Decimal128::new(1, 38, 0)), 1i64.into()],
                ),
            ),
            (
                AggregateFunc::SumFloat32,
                vec![(Value::Float32(OrderedF32::from(1.0)), 1), (Value::Null, 1)],
                (
                    Value::Float32(OrderedF32::from(1.0)),
                    vec![
                        Value::Float64(OrderedF64::from(1.0)),
                        0i64.into(),
                        0i64.into(),
                        0i64.into(),
                        1i64.into(),
                    ],
                ),
            ),
            (
                AggregateFunc::MaxInt32,
                vec![(Value::Int32(1), 1), (Value::Int32(2), 1), (Value::Null, 1)],
                (Value::Int32(2), vec![Value::Int32(2), 2i64.into()]),
            ),
            (
                AggregateFunc::MinInt32,
                vec![(Value::Int32(2), 1), (Value::Int32(1), 1), (Value::Null, 1)],
                (Value::Int32(1), vec![Value::Int32(1), 2i64.into()]),
            ),
            (
                AggregateFunc::MaxFloat32,
                vec![
                    (Value::Float32(OrderedF32::from(1.0)), 1),
                    (Value::Float32(OrderedF32::from(2.0)), 1),
                    (Value::Null, 1),
                ],
                (
                    Value::Float32(OrderedF32::from(2.0)),
                    vec![Value::Float32(OrderedF32::from(2.0)), 2i64.into()],
                ),
            ),
            (
                AggregateFunc::MaxDateTime,
                vec![
                    (Value::DateTime(DateTime::from(0)), 1),
                    (Value::DateTime(DateTime::from(1)), 1),
                    (Value::Null, 1),
                ],
                (
                    Value::DateTime(DateTime::from(1)),
                    vec![Value::DateTime(DateTime::from(1)), 2i64.into()],
                ),
            ),
            (
                AggregateFunc::Count,
                vec![
                    (Value::Int32(1), 1),
                    (Value::Int32(2), 1),
                    (Value::Null, 1),
                    (Value::Null, 1),
                ],
                (2i64.into(), vec![Value::Null, 2i64.into()]),
            ),
            (
                AggregateFunc::Any,
                vec![
                    (Value::Boolean(false), 1),
                    (Value::Boolean(false), 1),
                    (Value::Boolean(true), 1),
                    (Value::Null, 1),
                ],
                (
                    Value::Boolean(true),
                    vec![Value::from(1i64), Value::from(2i64)],
                ),
            ),
            (
                AggregateFunc::All,
                vec![
                    (Value::Boolean(false), 1),
                    (Value::Boolean(false), 1),
                    (Value::Boolean(true), 1),
                    (Value::Null, 1),
                ],
                (
                    Value::Boolean(false),
                    vec![Value::from(1i64), Value::from(2i64)],
                ),
            ),
            (
                AggregateFunc::MaxBool,
                vec![
                    (Value::Boolean(false), 1),
                    (Value::Boolean(false), 1),
                    (Value::Boolean(true), 1),
                    (Value::Null, 1),
                ],
                (
                    Value::Boolean(true),
                    vec![Value::from(1i64), Value::from(2i64)],
                ),
            ),
            (
                AggregateFunc::MinBool,
                vec![
                    (Value::Boolean(false), 1),
                    (Value::Boolean(false), 1),
                    (Value::Boolean(true), 1),
                    (Value::Null, 1),
                ],
                (
                    Value::Boolean(false),
                    vec![Value::from(1i64), Value::from(2i64)],
                ),
            ),
        ];

        for (aggr_fn, input, (eval_res, state)) in testcases {
            let create_and_insert = || -> Result<Accum, EvalError> {
                let mut acc = Accum::new_accum(&aggr_fn)?;
                acc.update_batch(&aggr_fn, input.clone())?;
                let row = acc.into_state();
                let acc = Accum::try_into_accum(&aggr_fn, row)?;
                Ok(acc)
            };
            let acc = match create_and_insert() {
                Ok(acc) => acc,
                Err(err) => panic!(
                    "Failed to create accum for {:?} with input {:?} with error: {:?}",
                    aggr_fn, input, err
                ),
            };

            if acc.eval(&aggr_fn).unwrap() != eval_res {
                panic!(
                    "Failed to eval accum for {:?} with input {:?}, expect {:?}, got {:?}",
                    aggr_fn,
                    input,
                    eval_res,
                    acc.eval(&aggr_fn).unwrap()
                );
            }
            let actual_state = acc.into_state();
            if actual_state != state {
                panic!(
                    "Failed to cast into state from accum for {:?} with input {:?}, expect state {:?}, got state {:?}",
                    aggr_fn, input, state, actual_state
                );
            }
        }
    }

    #[test]
    fn test_fail_path_accum() {
        {
            let bool_accum = Bool::try_from(vec![Value::Null]);
            assert!(matches!(bool_accum, Err(EvalError::Internal { .. })));
        }

        {
            let mut bool_accum = Bool::try_from(vec![1i64.into(), 1i64.into()]).unwrap();
            // serde round-trip
            let bool_accum_serde = serde_json::to_string(&bool_accum).unwrap();
            let bool_accum_de = serde_json::from_str::<Bool>(&bool_accum_serde).unwrap();
            assert_eq!(bool_accum, bool_accum_de);
            assert!(matches!(
                bool_accum.update(&AggregateFunc::MaxDate, 1.into(), 1),
                Err(EvalError::Internal { .. })
            ));
            assert!(matches!(
                bool_accum.update(&AggregateFunc::Any, 1.into(), 1),
                Err(EvalError::TypeMismatch { .. })
            ));
            assert!(matches!(
                bool_accum.eval(&AggregateFunc::MaxDate),
                Err(EvalError::Internal { .. })
            ));
        }

        {
            let ret = SimpleNumber::try_from(vec![Value::Null]);
            assert!(matches!(ret, Err(EvalError::Internal { .. })));
            let mut accum =
                SimpleNumber::try_from(vec![Decimal128::new(0, 38, 0).into(), 0i64.into()])
                    .unwrap();

            assert!(matches!(
                accum.update(&AggregateFunc::All, 0.into(), 1),
                Err(EvalError::Internal { .. })
            ));
            assert!(matches!(
                accum.update(&AggregateFunc::SumInt64, 0i32.into(), 1),
                Err(EvalError::TypeMismatch { .. })
            ));
            assert!(matches!(
                accum.eval(&AggregateFunc::All),
                Err(EvalError::Internal { .. })
            ));
            accum
                .update(&AggregateFunc::SumInt64, 1i64.into(), 1)
                .unwrap();
            accum
                .update(&AggregateFunc::SumInt64, i64::MAX.into(), 1)
                .unwrap();
            assert!(matches!(
                accum.eval(&AggregateFunc::SumInt64),
                Err(EvalError::Overflow { .. })
            ));
        }

        {
            let ret = Float::try_from(vec![2f64.into(), 0i64.into(), 0i64.into(), 0i64.into()]);
            assert!(matches!(ret, Err(EvalError::Internal { .. })));
            let mut accum = Float::try_from(vec![
                2f64.into(),
                0i64.into(),
                0i64.into(),
                0i64.into(),
                1i64.into(),
            ])
            .unwrap();
            accum
                .update(&AggregateFunc::SumFloat64, 2f64.into(), -1)
                .unwrap();
            assert!(matches!(
                accum.update(&AggregateFunc::All, 0.into(), 1),
                Err(EvalError::Internal { .. })
            ));
            assert!(matches!(
                accum.update(&AggregateFunc::SumFloat64, 0.0f32.into(), 1),
                Err(EvalError::TypeMismatch { .. })
            ));
            // no record, no accum
            assert_eq!(
                accum.eval(&AggregateFunc::SumFloat64).unwrap(),
                0.0f64.into()
            );

            assert!(matches!(
                accum.eval(&AggregateFunc::All),
                Err(EvalError::Internal { .. })
            ));

            accum
                .update(&AggregateFunc::SumFloat64, f64::INFINITY.into(), 1)
                .unwrap();
            accum
                .update(&AggregateFunc::SumFloat64, (-f64::INFINITY).into(), 1)
                .unwrap();
            accum
                .update(&AggregateFunc::SumFloat64, f64::NAN.into(), 1)
                .unwrap();
        }

        {
            let ret = OrdValue::try_from(vec![Value::Null]);
            assert!(matches!(ret, Err(EvalError::Internal { .. })));
            let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
            assert!(matches!(
                accum.update(&AggregateFunc::All, 0.into(), 1),
                Err(EvalError::Internal { .. })
            ));
            accum
                .update(&AggregateFunc::MaxInt16, 1i16.into(), 1)
                .unwrap();
            assert!(matches!(
                accum.update(&AggregateFunc::MaxInt16, 0i32.into(), 1),
                Err(EvalError::TypeMismatch { .. })
            ));
            assert!(matches!(
                accum.update(&AggregateFunc::MaxInt16, 0i16.into(), -1),
                Err(EvalError::Internal { .. })
            ));
            accum
                .update(&AggregateFunc::MaxInt16, Value::Null, 1)
                .unwrap();
        }

        // inserting uint64 into max_int64 should fail
        {
            let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
            assert!(matches!(
                accum.update(&AggregateFunc::MaxInt64, 0u64.into(), 1),
                Err(EvalError::TypeMismatch { .. })
            ));
        }
    }
}
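
The tests above depend on accumulator state surviving a round trip through `into_state`/`try_into_accum`: state lives as a flat row of values rather than a Rust object, so it can be stored in and restored from a dataflow arrangement. A stripped-down sketch of that round-trip pattern (the `MiniBool` type is made up for illustration, not part of the crate):

```rust
/// A bool accumulator whose state is a flat Vec<i64>, mirroring
/// Bool's `into_state` / `TryFrom<Vec<Value>>` pair above.
#[derive(Debug, PartialEq)]
struct MiniBool {
    trues: i64,
    falses: i64,
}

impl MiniBool {
    fn into_state(self) -> Vec<i64> {
        vec![self.trues, self.falses]
    }

    fn try_from_state(state: Vec<i64>) -> Result<Self, String> {
        // Reject malformed state, like the `ensure!(state.len() == 2)` checks above.
        let [trues, falses]: [i64; 2] = state
            .try_into()
            .map_err(|_| "Bool accumulator state should have 2 values".to_string())?;
        Ok(Self { trues, falses })
    }
}

fn main() {
    let acc = MiniBool { trues: 3, falses: 1 };
    let state = acc.into_state();
    let restored = MiniBool::try_from_state(state).unwrap();
    assert_eq!(restored, MiniBool { trues: 3, falses: 1 });
}
```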
@@ -12,15 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-use std::any::type_name;
-
use common_time::{Date, DateTime};
use datatypes::prelude::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, Value};
use serde::{Deserialize, Serialize};

use crate::expr::error::{EvalError, TryFromValueSnafu, TypeMismatchSnafu};
-use crate::expr::relation::accum::Accum;
+use crate::expr::relation::accum::{Accum, Accumulator};
use crate::repr::Diff;

/// Aggregate functions that can be applied to a group of rows.
@@ -83,3 +81,280 @@ pub enum AggregateFunc {
    Any,
    All,
}

impl AggregateFunc {
    pub fn is_max(&self) -> bool {
        self.signature().generic_fn == GenericFn::Max
    }

    pub fn is_min(&self) -> bool {
        self.signature().generic_fn == GenericFn::Min
    }

    pub fn is_sum(&self) -> bool {
        self.signature().generic_fn == GenericFn::Sum
    }

    /// Eval a batch of (value, diff) pairs against the accumulator state.
    ///
    /// Expects self to be an accumulable aggregate function, i.e. sum/count.
    ///
    /// TODO(discord9): deal with overflow & better accumulator
    pub fn eval_diff_accumulable<I>(
        &self,
        accum: Vec<Value>,
        value_diffs: I,
    ) -> Result<(Value, Vec<Value>), EvalError>
    where
        I: IntoIterator<Item = (Value, Diff)>,
    {
        let mut accum = if accum.is_empty() {
            Accum::new_accum(self)?
        } else {
            Accum::try_into_accum(self, accum)?
        };
        accum.update_batch(self, value_diffs)?;
        let res = accum.eval(self)?;
        Ok((res, accum.into_state()))
    }
}

pub struct Signature {
    pub input: ConcreteDataType,
    pub output: ConcreteDataType,
    pub generic_fn: GenericFn,
}

#[derive(Debug, PartialEq, Eq)]
pub enum GenericFn {
    Max,
    Min,
    Sum,
    Count,
    Any,
    All,
}

impl AggregateFunc {
    /// All concrete datatypes with precision are returned as the largest possible variant.
    /// As an exception, count has a signature of `null -> i64`, but it's actually `anytype -> i64`.
    pub fn signature(&self) -> Signature {
        match self {
            AggregateFunc::MaxInt16 => Signature {
                input: ConcreteDataType::int16_datatype(),
                output: ConcreteDataType::int16_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxInt32 => Signature {
                input: ConcreteDataType::int32_datatype(),
                output: ConcreteDataType::int32_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxInt64 => Signature {
                input: ConcreteDataType::int64_datatype(),
                output: ConcreteDataType::int64_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxUInt16 => Signature {
                input: ConcreteDataType::uint16_datatype(),
                output: ConcreteDataType::uint16_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxUInt32 => Signature {
                input: ConcreteDataType::uint32_datatype(),
                output: ConcreteDataType::uint32_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxUInt64 => Signature {
                input: ConcreteDataType::uint64_datatype(),
                output: ConcreteDataType::uint64_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxFloat32 => Signature {
                input: ConcreteDataType::float32_datatype(),
                output: ConcreteDataType::float32_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxFloat64 => Signature {
                input: ConcreteDataType::float64_datatype(),
                output: ConcreteDataType::float64_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxBool => Signature {
                input: ConcreteDataType::boolean_datatype(),
                output: ConcreteDataType::boolean_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxString => Signature {
                input: ConcreteDataType::string_datatype(),
                output: ConcreteDataType::string_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxDate => Signature {
                input: ConcreteDataType::date_datatype(),
                output: ConcreteDataType::date_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxDateTime => Signature {
                input: ConcreteDataType::datetime_datatype(),
                output: ConcreteDataType::datetime_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxTimestamp => Signature {
                input: ConcreteDataType::timestamp_second_datatype(),
                output: ConcreteDataType::timestamp_second_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxTime => Signature {
                input: ConcreteDataType::time_second_datatype(),
                output: ConcreteDataType::time_second_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxDuration => Signature {
                input: ConcreteDataType::duration_second_datatype(),
                output: ConcreteDataType::duration_second_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MaxInterval => Signature {
                input: ConcreteDataType::interval_year_month_datatype(),
                output: ConcreteDataType::interval_year_month_datatype(),
                generic_fn: GenericFn::Max,
            },
            AggregateFunc::MinInt16 => Signature {
                input: ConcreteDataType::int16_datatype(),
                output: ConcreteDataType::int16_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinInt32 => Signature {
                input: ConcreteDataType::int32_datatype(),
                output: ConcreteDataType::int32_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinInt64 => Signature {
                input: ConcreteDataType::int64_datatype(),
                output: ConcreteDataType::int64_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinUInt16 => Signature {
                input: ConcreteDataType::uint16_datatype(),
                output: ConcreteDataType::uint16_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinUInt32 => Signature {
                input: ConcreteDataType::uint32_datatype(),
                output: ConcreteDataType::uint32_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinUInt64 => Signature {
                input: ConcreteDataType::uint64_datatype(),
                output: ConcreteDataType::uint64_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinFloat32 => Signature {
                input: ConcreteDataType::float32_datatype(),
                output: ConcreteDataType::float32_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinFloat64 => Signature {
                input: ConcreteDataType::float64_datatype(),
                output: ConcreteDataType::float64_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinBool => Signature {
                input: ConcreteDataType::boolean_datatype(),
                output: ConcreteDataType::boolean_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinString => Signature {
                input: ConcreteDataType::string_datatype(),
                output: ConcreteDataType::string_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinDate => Signature {
                input: ConcreteDataType::date_datatype(),
                output: ConcreteDataType::date_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinDateTime => Signature {
                input: ConcreteDataType::datetime_datatype(),
                output: ConcreteDataType::datetime_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinTimestamp => Signature {
                input: ConcreteDataType::timestamp_second_datatype(),
                output: ConcreteDataType::timestamp_second_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinTime => Signature {
                input: ConcreteDataType::time_second_datatype(),
                output: ConcreteDataType::time_second_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinDuration => Signature {
                input: ConcreteDataType::duration_second_datatype(),
                output: ConcreteDataType::duration_second_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::MinInterval => Signature {
                input: ConcreteDataType::interval_year_month_datatype(),
                output: ConcreteDataType::interval_year_month_datatype(),
                generic_fn: GenericFn::Min,
            },
            AggregateFunc::SumInt16 => Signature {
                input: ConcreteDataType::int16_datatype(),
                output: ConcreteDataType::int16_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::SumInt32 => Signature {
                input: ConcreteDataType::int32_datatype(),
                output: ConcreteDataType::int32_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::SumInt64 => Signature {
                input: ConcreteDataType::int64_datatype(),
                output: ConcreteDataType::int64_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::SumUInt16 => Signature {
                input: ConcreteDataType::uint16_datatype(),
                output: ConcreteDataType::uint16_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::SumUInt32 => Signature {
                input: ConcreteDataType::uint32_datatype(),
                output: ConcreteDataType::uint32_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::SumUInt64 => Signature {
                input: ConcreteDataType::uint64_datatype(),
                output: ConcreteDataType::uint64_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::SumFloat32 => Signature {
                input: ConcreteDataType::float32_datatype(),
                output: ConcreteDataType::float32_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::SumFloat64 => Signature {
                input: ConcreteDataType::float64_datatype(),
                output: ConcreteDataType::float64_datatype(),
                generic_fn: GenericFn::Sum,
            },
            AggregateFunc::Count => Signature {
                input: ConcreteDataType::null_datatype(),
                output: ConcreteDataType::int64_datatype(),
                generic_fn: GenericFn::Count,
            },
            AggregateFunc::Any => Signature {
                input: ConcreteDataType::boolean_datatype(),
                output: ConcreteDataType::boolean_datatype(),
                generic_fn: GenericFn::Any,
            },
            AggregateFunc::All => Signature {
                input: ConcreteDataType::boolean_datatype(),
                output: ConcreteDataType::boolean_datatype(),
                generic_fn: GenericFn::All,
            },
        }
    }
}
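
`eval_diff_accumulable` is meant to be called repeatedly: each call folds a batch of `(value, diff)` changes into the saved state row and returns both the new result and the new state. A hedged, self-contained sketch of that driving loop, with a toy count aggregate standing in for `AggregateFunc` (all names here are illustrative):

```rust
/// Toy analog of `eval_diff_accumulable`: fold a batch of (value, diff)
/// pairs into flat state, returning (result, new_state).
fn eval_diff_count(state: Vec<i64>, batch: &[(Option<i64>, i64)]) -> (i64, Vec<i64>) {
    // Empty state means "fresh accumulator", as in `Accum::new_accum`.
    let mut non_nulls = state.first().copied().unwrap_or(0);
    for (value, diff) in batch {
        if value.is_some() {
            non_nulls += diff; // count(col) counts only non-null values
        }
    }
    (non_nulls, vec![non_nulls])
}

fn main() {
    // The first batch starts from empty state; later batches resume from it.
    let (res, state) = eval_diff_count(vec![], &[(Some(1), 1), (None, 1), (Some(2), 1)]);
    assert_eq!(res, 2);
    let (res, _state) = eval_diff_count(state, &[(Some(2), -1)]);
    assert_eq!(res, 1);
}
```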
@@ -17,4 +17,5 @@
// allow unused for now because it should be used later
mod adapter;
mod expr;
+mod plan;
mod repr;
src/flow/src/plan.rs (new file, 98 lines)
@@ -0,0 +1,98 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! This module contains basic definitions for a dataflow's plan
//! that can be translated to a hydro dataflow.

mod join;
mod reduce;

use serde::{Deserialize, Serialize};

pub(crate) use self::reduce::{AccumulablePlan, KeyValPlan, ReducePlan};
use crate::expr::{
    AggregateExpr, EvalError, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
};
use crate::plan::join::JoinPlan;
use crate::repr::{DiffRow, RelationType};

#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct TypedPlan {
    /// output type of the relation
    pub typ: RelationType,
    pub plan: Plan,
}

/// TODO(discord9): support `TableFunc` (by defining a FlatMap that maps 1 to n)
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum Plan {
    /// A constant collection of rows.
    Constant { rows: Vec<DiffRow> },
    /// Get CDC data from a source, be it an external reference to an existing source or an
    /// internal reference to a `Let` identifier
    Get { id: Id },
    /// Create a temporary collection from the given `value`, and make this binding available
    /// only in the scope of `body`
    Let {
        id: LocalId,
        value: Box<Plan>,
        body: Box<Plan>,
    },
    /// Map, Filter, and Project operators.
    Mfp {
        /// The input collection.
        input: Box<Plan>,
        /// Linear operator to apply to each record.
        mfp: MapFilterProject,
    },
    /// Reduce operator, aggregation by key assembled from KeyValPlan
    Reduce {
        /// The input collection.
        input: Box<Plan>,
        /// A plan for changing input records into key, value pairs.
        key_val_plan: KeyValPlan,
        /// A plan for performing the reduce.
        ///
        /// The implementation of reduction has several different strategies based
        /// on the properties of the reduction, and the input itself.
        reduce_plan: ReducePlan,
    },
    /// A multiway relational equijoin, with fused map, filter, and projection.
    ///
    /// This stage performs a multiway join among `inputs`, using the equality
    /// constraints expressed in `plan`. The plan also describes the implementation
    /// strategy we will use, and any pushed down per-record work.
    Join {
        /// An ordered list of inputs that will be joined.
        inputs: Vec<Plan>,
        /// Detailed information about the implementation of the join.
        ///
        /// This includes information about the implementation strategy, but also
        /// any map, filter, project work that we might follow the join with, but
        /// potentially pushed down into the implementation of the join.
        plan: JoinPlan,
    },
    /// Adds the contents of the input collections.
    ///
    /// Importantly, this is *multiset* union, so the multiplicities of records will
    /// add. This is in contrast to *set* union, where the multiplicities would be
    /// capped at one. A set union can be formed with `Union` followed by `Reduce`
    /// implementing the "distinct" operator.
    Union {
        /// The input collections
        inputs: Vec<Plan>,
        /// Whether to consolidate the output, e.g., cancel negated records.
        consolidate_output: bool,
    },
}
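
The `Union` doc above draws the multiset/set distinction: multiplicities add, and "distinct" is a later `Reduce` that caps each surviving count at one. A small standalone sketch of that arithmetic, with rows kept in a map of counts (illustrative only, not the dataflow implementation):

```rust
use std::collections::HashMap;

/// Multiset union: per-row counts simply add (negative diffs cancel).
fn union(inputs: &[Vec<(&'static str, i64)>]) -> HashMap<&'static str, i64> {
    let mut counts = HashMap::new();
    for input in inputs {
        for (row, diff) in input {
            *counts.entry(*row).or_insert(0) += diff;
        }
    }
    // consolidate_output: drop records whose counts cancelled to zero
    counts.retain(|_, c| *c != 0);
    counts
}

fn main() {
    let a = vec![("x", 2), ("y", 1)];
    let b = vec![("x", 1), ("y", -1)];
    let merged = union(&[a, b]);
    assert_eq!(merged.get("x"), Some(&3)); // multiplicities added
    assert_eq!(merged.get("y"), None); // +1 and -1 cancelled

    // A set union would follow this with a "distinct" reduce,
    // capping every surviving count at one.
    let distinct: HashMap<_, _> = merged.into_iter().map(|(r, _)| (r, 1)).collect();
    assert_eq!(distinct.get("x"), Some(&1));
}
```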
src/flow/src/plan/join.rs (new file, 78 lines)
@@ -0,0 +1,78 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use serde::{Deserialize, Serialize};

use crate::expr::ScalarExpr;
use crate::plan::SafeMfpPlan;

/// TODO(discord9): consider impl more join strategies
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub enum JoinPlan {
    Linear(LinearJoinPlan),
}

/// Determines if a given row should stay in the output, and applies a map/filter/project
/// before outputting the row.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct JoinFilter {
    /// Each element in the outer vector checks whether every expr within it evaluates to the
    /// same value; if not, the row is filtered out. Useful for equi-joins (joins based on
    /// equality of some columns).
    pub ready_equivalences: Vec<Vec<ScalarExpr>>,
    /// Apply a map filter project before outputting the row
    pub before: SafeMfpPlan,
}

/// A plan for the execution of a linear join.
///
/// A linear join is a sequence of stages, each of which introduces
/// a new collection. Each stage is represented by a [LinearStagePlan].
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearJoinPlan {
    /// The source relation from which we start the join.
    pub source_relation: usize,
    /// The arrangement to use for the source relation, if any
    pub source_key: Option<Vec<ScalarExpr>>,
    /// An initial closure to apply before any stages.
    ///
    /// Values of `None` indicate the identity closure.
    pub initial_closure: Option<JoinFilter>,
    /// A *sequence* of stages to apply one after the other.
    pub stage_plans: Vec<LinearStagePlan>,
    /// A concluding filter to apply after the last stage.
    ///
    /// Values of `None` indicate the identity closure.
    pub final_closure: Option<JoinFilter>,
}

/// A plan for the execution of one stage of a linear join.
///
/// Each stage is a binary join between the current accumulated
/// join results, and a new collection. The former is referred to
/// as the "stream" and the latter the "lookup".
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearStagePlan {
    /// The index of the relation into which we will look up.
    pub lookup_relation: usize,
    /// The key expressions to use for the stream relation.
    pub stream_key: Vec<ScalarExpr>,
    /// Columns to retain from the stream relation.
    /// These columns are those that are not redundant with `stream_key`,
    /// and cannot be read out of the key component of an arrangement.
    pub stream_thinning: Vec<usize>,
    /// The key expressions to use for the lookup relation.
    pub lookup_key: Vec<ScalarExpr>,
    /// The closure to apply to the concatenation of the key columns,
    /// the stream value columns, and the lookup value columns.
    pub closure: JoinFilter,
}
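
Each `LinearStagePlan` above is one hash-lookup step: the accumulated "stream" rows are keyed, probed against an arrangement of the "lookup" relation, and the matched pairs feed the next stage. A simplified single-stage sketch with string keys (real stages key by `ScalarExpr` evaluation and apply a `JoinFilter` closure; these names are illustrative):

```rust
use std::collections::HashMap;

/// One linear-join stage: probe `stream` rows into an index over `lookup`.
fn join_stage(
    stream: Vec<(String, Vec<i64>)>, // (key, retained stream columns)
    lookup: Vec<(String, Vec<i64>)>, // (key, lookup value columns)
) -> Vec<(String, Vec<i64>)> {
    // Arrange the lookup side by key, as an arrangement would.
    let mut index: HashMap<String, Vec<Vec<i64>>> = HashMap::new();
    for (key, vals) in lookup {
        index.entry(key).or_default().push(vals);
    }

    // For each stream row, emit one output per matching lookup row:
    // key ++ stream columns ++ lookup columns.
    let mut out = Vec::new();
    for (key, stream_vals) in stream {
        for lookup_vals in index.get(&key).into_iter().flatten() {
            let mut row = stream_vals.clone();
            row.extend_from_slice(lookup_vals);
            out.push((key.clone(), row));
        }
    }
    out
}

fn main() {
    let stream = vec![("a".to_string(), vec![1])];
    let lookup = vec![("a".to_string(), vec![10]), ("b".to_string(), vec![20])];
    assert_eq!(join_stage(stream, lookup), vec![("a".to_string(), vec![1, 10])]);
}
```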
src/flow/src/plan/reduce.rs (new file, 50 lines)
@@ -0,0 +1,50 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use serde::{Deserialize, Serialize};

use crate::expr::{AggregateExpr, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr};

#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct KeyValPlan {
    pub key_plan: SafeMfpPlan,
    pub val_plan: SafeMfpPlan,
}

/// TODO(discord9): def & impl of hierarchical aggregates (for min/max with support for
/// deletion), basic aggregates (for other aggregate functions), and mixed aggregates
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum ReducePlan {
    /// Plan for not computing any aggregations, just determining the set of
    /// distinct keys.
    Distinct,
    /// Plan for computing only accumulable aggregations.
    /// Including simple functions like `sum`, `count`, `min`/`max` (without deletion)
    Accumulable(AccumulablePlan),
}

/// Accumulable plan for the execution of a reduction.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct AccumulablePlan {
    /// All of the aggregations we were asked to compute, stored
    /// in order.
    pub full_aggrs: Vec<AggregateExpr>,
    /// All of the non-distinct accumulable aggregates.
    /// Each element represents:
    /// (index of aggr output, index of value among inputs, aggr expr)
    /// These will all be rendered together in one dataflow fragment.
    pub simple_aggrs: Vec<(usize, usize, AggregateExpr)>,
    /// Same as above but for all of the `DISTINCT` accumulable aggregations.
    pub distinct_aggrs: Vec<(usize, usize, AggregateExpr)>,
}
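
The `(index of aggr output, index of value among inputs, aggr expr)` triples in `AccumulablePlan` let several aggregates run over one pass of the values: each triple picks its input column and writes its own output slot. A toy sketch of that dispatch, with plain functions standing in for `AggregateExpr` (names and types here are made up for illustration):

```rust
/// Runs several aggregates in one pass, AccumulablePlan-style: each
/// (out_idx, in_idx, aggr) triple reads input column in_idx and fills
/// output slot out_idx.
fn run_simple_aggrs(
    rows: &[Vec<i64>],
    simple_aggrs: &[(usize, usize, fn(&[i64]) -> i64)],
) -> Vec<i64> {
    let mut output = vec![0; simple_aggrs.len()];
    for &(out_idx, in_idx, aggr) in simple_aggrs {
        let column: Vec<i64> = rows.iter().map(|row| row[in_idx]).collect();
        output[out_idx] = aggr(&column);
    }
    output
}

fn main() {
    fn sum(col: &[i64]) -> i64 {
        col.iter().sum()
    }
    fn max(col: &[i64]) -> i64 {
        *col.iter().max().unwrap()
    }
    let rows = vec![vec![1, 10], vec![2, 20]];
    // sum of column 0 into output slot 0, max of column 1 into slot 1
    let plan = [(0, 0, sum as fn(&[i64]) -> i64), (1, 1, max)];
    assert_eq!(run_simple_aggrs(&rows, &plan), vec![3, 20]);
}
```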
@@ -33,7 +33,10 @@ use snafu::ResultExt;

use crate::expr::error::{CastValueSnafu, EvalError};

-/// System-wide Record count difference type.
+/// System-wide record count difference type. Useful for capturing data changes:
+///
+/// i.e. +1 means insert one record, -1 means remove one,
+/// and +/-n means insert/remove multiple duplicate records.
pub type Diff = i64;

/// System-wide default timestamp type
@@ -28,6 +28,7 @@ use api::v1::meta::Role;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use catalog::CatalogManagerRef;
+use client::OutputData;
use common_base::Plugins;
use common_config::KvBackendConfig;
use common_error::ext::BoxedError;
@@ -401,13 +402,13 @@ impl SqlQueryHandler for Instance {

/// Attaches a timer to the output and observes it once the output is exhausted.
pub fn attach_timer(output: Output, timer: HistogramTimer) -> Output {
-    match output {
-        Output::AffectedRows(_) | Output::RecordBatches(_) => output,
-        Output::Stream(stream, plan) => {
+    match output.data {
+        OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => output,
+        OutputData::Stream(stream) => {
            let stream = OnDone::new(stream, move || {
                timer.observe_duration();
            });
-            Output::Stream(Box::pin(stream), plan)
+            Output::new(OutputData::Stream(Box::pin(stream)), output.meta)
        }
    }
}
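
`OnDone` works by intercepting the end-of-stream signal: it forwards every `poll_next` and fires its callback exactly once when the inner stream yields `None`, which is how the timer above only observes after the last batch is drained. A rough standalone sketch of that wrapper idea (not the actual `servers` implementation; assumes the `futures` and `tokio` crates):

```rust
use std::pin::Pin;
use std::task::{Context, Poll};

use futures::stream::{Stream, StreamExt};

/// Runs `on_done` once, when the inner stream is exhausted.
struct OnDone<S, F: FnOnce()> {
    inner: Pin<Box<S>>,
    on_done: Option<F>,
}

impl<S: Stream, F: FnOnce() + Unpin> Stream for OnDone<S, F> {
    type Item = S::Item;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let this = self.get_mut();
        match this.inner.as_mut().poll_next(cx) {
            Poll::Ready(None) => {
                // Fire the callback exactly once at end of stream,
                // e.g. timer.observe_duration() in attach_timer above.
                if let Some(f) = this.on_done.take() {
                    f();
                }
                Poll::Ready(None)
            }
            other => other,
        }
    }
}

#[tokio::main]
async fn main() {
    let mut wrapped = OnDone {
        inner: Box::pin(futures::stream::iter(1..=3)),
        on_done: Some(|| println!("stream exhausted, observe timer here")),
    };
    while let Some(item) = wrapped.next().await {
        println!("batch {item}");
    }
}
```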
@@ -113,7 +113,7 @@ impl GrpcQueryHandler for Instance {
                    .statement_executor
                    .create_table_inner(&mut expr, None, &ctx)
                    .await?;
-                Output::AffectedRows(0)
+                Output::new_with_affected_rows(0)
            }
            DdlExpr::Alter(expr) => self.statement_executor.alter_table_inner(expr).await?,
            DdlExpr::CreateDatabase(expr) => {
@@ -47,8 +47,8 @@ impl OpentsdbProtocolHandler for Instance {
            .map_err(BoxedError::new)
            .context(servers::error::ExecuteGrpcQuerySnafu)?;

-        Ok(match output {
-            common_query::Output::AffectedRows(rows) => rows,
+        Ok(match output.data {
+            common_query::OutputData::AffectedRows(rows) => rows,
            _ => unreachable!(),
        })
    }
@@ -19,6 +19,7 @@ use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse, Wri
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
+use client::OutputData;
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
@@ -77,7 +78,7 @@ fn negotiate_response_type(accepted_response_types: &[i32]) -> ServerResult<Resp
}

async fn to_query_result(table_name: &str, output: Output) -> ServerResult<QueryResult> {
-    let Output::Stream(stream, _) = output else {
+    let OutputData::Stream(stream) = output.data else {
        unreachable!()
    };
    let recordbatches = RecordBatches::try_collect(stream)
@@ -152,6 +152,10 @@ impl TxnService for RaftEngineBackend {
            responses,
        })
    }
+
+    fn max_txn_ops(&self) -> usize {
+        usize::MAX
+    }
}

#[async_trait::async_trait]
@@ -24,7 +24,9 @@ fn main() {

#[tokio::main]
async fn run() {
-    let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"]).await.unwrap();
+    let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"], 128)
+        .await
+        .unwrap();

    // put
    let put_req = PutRequest {
@@ -193,7 +193,8 @@ pub async fn metasrv_builder(
        (None, false) => {
            let etcd_client = create_etcd_client(opts).await?;
            let kv_backend = {
-                let etcd_backend = EtcdStore::with_etcd_client(etcd_client.clone());
+                let etcd_backend =
+                    EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops);
                if !opts.store_key_prefix.is_empty() {
                    Arc::new(ChrootKvBackend::new(
                        opts.store_key_prefix.clone().into_bytes(),
@@ -79,6 +79,17 @@ pub struct MetaSrvOptions {
    pub wal: MetaSrvWalConfig,
    pub export_metrics: ExportMetricsOption,
    pub store_key_prefix: String,
+    /// The max operations per txn
+    ///
+    /// This value is usually limited by which store is used for the `KvBackend`.
+    /// For example, if using etcd, this value should ensure that it is less than
+    /// or equal to the `--max-txn-ops` option value of etcd.
+    ///
+    /// TODO(jeremy): Currently, this option only affects the etcd store, but it may
+    /// also affect other stores in the future. In other words, each store needs to
+    /// limit the number of operations in a txn, because an infinitely large txn could
+    /// potentially block other operations.
+    pub max_txn_ops: usize,
}

impl MetaSrvOptions {
@@ -112,6 +123,7 @@ impl Default for MetaSrvOptions {
            wal: MetaSrvWalConfig::default(),
            export_metrics: ExportMetricsOption::default(),
            store_key_prefix: String::new(),
+            max_txn_ops: 128,
        }
    }
}
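
Capping `max_txn_ops` matters because etcd rejects transactions larger than its own `--max-txn-ops` limit (128 by default), so a store that respects the cap has to split big batches into several smaller txns. A hedged sketch of that splitting, with `chunks` doing the work (illustrative types, not the actual `EtcdStore` code):

```rust
/// Split a batch of operations into txn-sized chunks so that no single
/// transaction exceeds the backend's limit (cf. etcd's --max-txn-ops).
fn into_txn_batches<Op: Clone>(ops: &[Op], max_txn_ops: usize) -> Vec<Vec<Op>> {
    ops.chunks(max_txn_ops).map(|chunk| chunk.to_vec()).collect()
}

fn main() {
    let ops: Vec<u32> = (0..300).collect();
    let batches = into_txn_batches(&ops, 128);
    // 300 ops under a 128-op cap need three transactions: 128 + 128 + 44.
    assert_eq!(batches.len(), 3);
    assert_eq!(batches[2].len(), 44);
    assert!(batches.iter().all(|b| b.len() <= 128));
}
```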
@@ -42,7 +42,7 @@ pub async fn mock_with_memstore() -> MockInfo {
}

pub async fn mock_with_etcdstore(addr: &str) -> MockInfo {
-    let kv_backend = EtcdStore::with_endpoints([addr]).await.unwrap();
+    let kv_backend = EtcdStore::with_endpoints([addr], 128).await.unwrap();
    mock(Default::default(), kv_backend, None, None).await
}

@@ -380,6 +380,10 @@ impl TxnService for LeaderCachedKvBackend {

        Ok(res)
    }
+
+    fn max_txn_ops(&self) -> usize {
+        self.store.max_txn_ops()
+    }
}

impl ResettableKvBackend for LeaderCachedKvBackend {
@@ -79,5 +79,6 @@ rand.workspace = true
toml.workspace = true

[[bench]]
-name = "bench_merge_tree"
+name = "memtable_bench"
harness = false
+required-features = ["test"]
@@ -7,3 +7,9 @@ The Alfa Romeo [MiTo](https://en.wikipedia.org/wiki/Alfa_Romeo_MiTo) is a front-

> "You can't be a true petrolhead until you've owned an Alfa Romeo."
> <div align="right">-- by Jeremy Clarkson</div>
+
+## Benchmarks
+
+Run benchmarks in this crate:
+
+```bash
+cargo bench -p mito2 -F test
+```
src/mito2/benches/memtable_bench.rs (new file, 352 lines)
@@ -0,0 +1,352 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use api::v1::value::ValueData;
use api::v1::{Row, Rows, SemanticType};
use criterion::{criterion_group, criterion_main, Criterion};
use datafusion_common::Column;
use datafusion_expr::{lit, Expr};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::time_series::TimeSeriesMemtable;
use mito2::memtable::{KeyValues, Memtable};
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::Rng;
use store_api::metadata::{
    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::storage::RegionId;
use table::predicate::Predicate;

/// Writes rows.
fn write_rows(c: &mut Criterion) {
    let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
    let timestamps = (0..100).collect::<Vec<_>>();

    // Note that this test only generates one time series.
    let mut group = c.benchmark_group("write");
    group.bench_function("merge_tree", |b| {
        let memtable =
            MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
        let kvs =
            memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
        b.iter(|| {
            memtable.write(&kvs).unwrap();
        });
    });
    group.bench_function("time_series", |b| {
        let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
        let kvs =
            memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
        b.iter(|| {
            memtable.write(&kvs).unwrap();
        });
    });
}

/// Scans all rows.
fn full_scan(c: &mut Criterion) {
    let metadata = Arc::new(cpu_metadata());
    let config = MergeTreeConfig::default();
    let start_sec = 1710043200;
    let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);

    let mut group = c.benchmark_group("full_scan");
    group.sample_size(10);
    group.bench_function("merge_tree", |b| {
        let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
        for kvs in generator.iter() {
            memtable.write(&kvs).unwrap();
        }

        b.iter(|| {
            let iter = memtable.iter(None, None).unwrap();
            for batch in iter {
                let _batch = batch.unwrap();
            }
        });
    });
    group.bench_function("time_series", |b| {
        let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
        for kvs in generator.iter() {
            memtable.write(&kvs).unwrap();
        }

        b.iter(|| {
            let iter = memtable.iter(None, None).unwrap();
            for batch in iter {
                let _batch = batch.unwrap();
            }
        });
    });
}

/// Filters 1 host.
fn filter_1_host(c: &mut Criterion) {
    let metadata = Arc::new(cpu_metadata());
    let config = MergeTreeConfig::default();
    let start_sec = 1710043200;
    let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);

    let mut group = c.benchmark_group("filter_1_host");
    group.sample_size(10);
    group.bench_function("merge_tree", |b| {
        let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
        for kvs in generator.iter() {
            memtable.write(&kvs).unwrap();
        }
        let predicate = generator.random_host_filter();

        b.iter(|| {
            let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
            for batch in iter {
                let _batch = batch.unwrap();
            }
        });
    });
    group.bench_function("time_series", |b| {
        let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
        for kvs in generator.iter() {
            memtable.write(&kvs).unwrap();
        }
        let predicate = generator.random_host_filter();

        b.iter(|| {
            let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
            for batch in iter {
                let _batch = batch.unwrap();
            }
        });
    });
}

struct Host {
    hostname: String,
    region: String,
    datacenter: String,
    rack: String,
    os: String,
    arch: String,
    team: String,
    service: String,
    service_version: String,
    service_environment: String,
}

impl Host {
    fn random_with_id(id: usize) -> Host {
        let mut rng = rand::thread_rng();
        let region = format!("ap-southeast-{}", rng.gen_range(0..10));
        let datacenter = format!(
            "{}{}",
            region,
            ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
        );
        Host {
            hostname: format!("host_{id}"),
            region,
            datacenter,
            rack: rng.gen_range(0..100).to_string(),
            os: "Ubuntu16.04LTS".to_string(),
            arch: "x86".to_string(),
            team: "CHI".to_string(),
            service: rng.gen_range(0..100).to_string(),
            service_version: rng.gen_range(0..10).to_string(),
            service_environment: "test".to_string(),
        }
    }

    fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
        let tags = [
            api::v1::Value {
                value_data: Some(ValueData::StringValue(self.hostname.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.region.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.datacenter.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.rack.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.os.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.arch.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.team.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.service.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.service_version.clone())),
|
||||||
|
},
|
||||||
|
api::v1::Value {
|
||||||
|
value_data: Some(ValueData::StringValue(self.service_environment.clone())),
|
||||||
|
},
|
||||||
|
];
|
||||||
|
for tag in tags {
|
||||||
|
values.push(tag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct CpuDataGenerator {
|
||||||
|
metadata: RegionMetadataRef,
|
||||||
|
column_schemas: Vec<api::v1::ColumnSchema>,
|
||||||
|
hosts: Vec<Host>,
|
||||||
|
start_sec: i64,
|
||||||
|
end_sec: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CpuDataGenerator {
|
||||||
|
fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
|
||||||
|
let column_schemas = region_metadata_to_row_schema(&metadata);
|
||||||
|
Self {
|
||||||
|
metadata,
|
||||||
|
column_schemas,
|
||||||
|
hosts: Self::generate_hosts(num_hosts),
|
||||||
|
start_sec,
|
||||||
|
end_sec,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
|
||||||
|
// point per 10s.
|
||||||
|
(self.start_sec..self.end_sec)
|
||||||
|
.step_by(10)
|
||||||
|
.enumerate()
|
||||||
|
.map(|(seq, ts)| self.build_key_values(seq, ts))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
|
||||||
|
let rows = self
|
||||||
|
.hosts
|
||||||
|
.iter()
|
||||||
|
.map(|host| {
|
||||||
|
let mut rng = rand::thread_rng();
|
||||||
|
let mut values = Vec::with_capacity(21);
|
||||||
|
values.push(api::v1::Value {
|
||||||
|
value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
|
||||||
|
});
|
||||||
|
host.fill_values(&mut values);
|
||||||
|
for _ in 0..10 {
|
||||||
|
values.push(api::v1::Value {
|
||||||
|
value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Row { values }
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let mutation = api::v1::Mutation {
|
||||||
|
op_type: api::v1::OpType::Put as i32,
|
||||||
|
sequence: seq as u64,
|
||||||
|
rows: Some(Rows {
|
||||||
|
schema: self.column_schemas.clone(),
|
||||||
|
rows,
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
KeyValues::new(&self.metadata, mutation).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn random_host_filter(&self) -> Predicate {
|
||||||
|
let host = self.random_hostname();
|
||||||
|
let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
|
||||||
|
Predicate::new(vec![expr.into()])
|
||||||
|
}
|
||||||
|
|
||||||
|
fn random_hostname(&self) -> String {
|
||||||
|
let mut rng = rand::thread_rng();
|
||||||
|
self.hosts.choose(&mut rng).unwrap().hostname.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn random_f64(rng: &mut ThreadRng) -> f64 {
|
||||||
|
let base: u32 = rng.gen_range(30..95);
|
||||||
|
base as f64
|
||||||
|
}
|
||||||
|
|
||||||
|
fn generate_hosts(num_hosts: usize) -> Vec<Host> {
|
||||||
|
(0..num_hosts).map(Host::random_with_id).collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a metadata for TSBS cpu-like table.
|
||||||
|
fn cpu_metadata() -> RegionMetadata {
|
||||||
|
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
|
||||||
|
builder.push_column_metadata(ColumnMetadata {
|
||||||
|
column_schema: ColumnSchema::new(
|
||||||
|
"ts",
|
||||||
|
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||||
|
false,
|
||||||
|
),
|
||||||
|
semantic_type: SemanticType::Timestamp,
|
||||||
|
column_id: 0,
|
||||||
|
});
|
||||||
|
let mut column_id = 1;
|
||||||
|
let tags = [
|
||||||
|
"hostname",
|
||||||
|
"region",
|
||||||
|
"datacenter",
|
||||||
|
"rack",
|
||||||
|
"os",
|
||||||
|
"arch",
|
||||||
|
"team",
|
||||||
|
"service",
|
||||||
|
"service_version",
|
||||||
|
"service_environment",
|
||||||
|
];
|
||||||
|
for tag in tags {
|
||||||
|
builder.push_column_metadata(ColumnMetadata {
|
||||||
|
column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
|
||||||
|
semantic_type: SemanticType::Tag,
|
||||||
|
column_id,
|
||||||
|
});
|
||||||
|
column_id += 1;
|
||||||
|
}
|
||||||
|
let fields = [
|
||||||
|
"usage_user",
|
||||||
|
"usage_system",
|
||||||
|
"usage_idle",
|
||||||
|
"usage_nice",
|
||||||
|
"usage_iowait",
|
||||||
|
"usage_irq",
|
||||||
|
"usage_softirq",
|
||||||
|
"usage_steal",
|
||||||
|
"usage_guest",
|
||||||
|
"usage_guest_nice",
|
||||||
|
];
|
||||||
|
for field in fields {
|
||||||
|
builder.push_column_metadata(ColumnMetadata {
|
||||||
|
column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
|
||||||
|
semantic_type: SemanticType::Field,
|
||||||
|
column_id,
|
||||||
|
});
|
||||||
|
column_id += 1;
|
||||||
|
}
|
||||||
|
builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
|
||||||
|
builder.build().unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(benches, write_rows, full_scan, filter_1_host);
|
||||||
|
criterion_main!(benches);
|
||||||
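A note on the new benchmark above: `full_scan` and `filter_1_host` lower the sample size to 10 because each iteration scans the whole memtable, which holds 4000 hosts × 720 points (two hours at one point per 10 s), i.e. 2,880,000 rows. The file follows Criterion's group/function layout; a minimal runnable sketch of the same pattern, with illustrative names (`bench_push` and the `push`/`vec` labels are not part of this diff), assuming the bench target is registered in the crate's Cargo.toml with `harness = false`:

use criterion::{criterion_group, criterion_main, Criterion};

// Minimal Criterion skeleton mirroring memtable_bench.rs: one benchmark
// group, one bench_function per implementation, shared input per group.
fn bench_push(c: &mut Criterion) {
    let input: Vec<i64> = (0..100).collect();

    let mut group = c.benchmark_group("push");
    group.bench_function("vec", |b| {
        b.iter(|| {
            let mut v = Vec::with_capacity(input.len());
            v.extend_from_slice(&input);
            v
        })
    });
    group.finish();
}

criterion_group!(benches, bench_push);
criterion_main!(benches);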
@@ -1,36 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use criterion::{criterion_group, criterion_main, Criterion};
-use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
-use mito2::memtable::Memtable;
-use mito2::test_util::memtable_util;
-
-fn bench_merge_tree_memtable(c: &mut Criterion) {
-    let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
-    let timestamps = (0..100).collect::<Vec<_>>();
-
-    let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
-
-    let _ = c.bench_function("MergeTreeMemtable", |b| {
-        let kvs =
-            memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
-        b.iter(|| {
-            memtable.write(&kvs).unwrap();
-        });
-    });
-}
-
-criterion_group!(benches, bench_merge_tree_memtable);
-criterion_main!(benches);
@@ -158,7 +158,7 @@ impl CacheManager {
         }
     }
 
-    /// Gets the the write cache.
+    /// Gets the write cache.
     pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
         self.write_cache.as_ref()
    }
@@ -85,7 +85,7 @@ impl Default for MergeTreeConfig {
 
         Self {
             index_max_keys_per_shard: 8192,
-            data_freeze_threshold: 32768,
+            data_freeze_threshold: 131072,
             dedup: true,
             fork_dictionary_bytes,
         }
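The hunk above raises the default `data_freeze_threshold` from 32768 to 131072 rows, so an active data buffer now holds four times as many rows before being frozen into an immutable part. Because `MergeTreeConfig` implements `Default`, a single knob can still be overridden with struct-update syntax; a minimal sketch (the value 32768 here is only illustrative), mirroring the pattern the test later in this diff uses with `index_max_keys_per_shard`:

use mito2::memtable::merge_tree::MergeTreeConfig;

fn main() {
    // Keep all defaults but restore the old, smaller freeze threshold.
    let config = MergeTreeConfig {
        data_freeze_threshold: 32768,
        ..Default::default()
    };
    assert_eq!(32768, config.data_freeze_threshold);
}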
@@ -293,6 +293,8 @@ mod tests {
     use std::collections::BTreeSet;
 
     use common_time::Timestamp;
+    use datafusion_common::{Column, ScalarValue};
+    use datafusion_expr::{BinaryExpr, Expr, Operator};
     use datatypes::scalars::ScalarVector;
     use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
 
@@ -528,4 +530,55 @@ mod tests {
             .collect::<Vec<_>>();
         assert_eq!(expect, read);
     }
+
+    #[test]
+    fn test_memtable_filter() {
+        let metadata = memtable_util::metadata_with_primary_key(vec![0, 1], false);
+        // Try to build a memtable via the builder.
+        let memtable = MergeTreeMemtableBuilder::new(
+            MergeTreeConfig {
+                index_max_keys_per_shard: 40,
+                ..Default::default()
+            },
+            None,
+        )
+        .build(1, &metadata);
+
+        for i in 0..100 {
+            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
+            let kvs =
+                memtable_util::build_key_values(&metadata, "hello".to_string(), i, &timestamps, 1);
+            memtable.write(&kvs).unwrap();
+        }
+
+        for i in 0..100 {
+            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
+            let expr = Expr::BinaryExpr(BinaryExpr {
+                left: Box::new(Expr::Column(Column {
+                    relation: None,
+                    name: "k1".to_string(),
+                })),
+                op: Operator::Eq,
+                right: Box::new(Expr::Literal(ScalarValue::UInt32(Some(i)))),
+            });
+            let iter = memtable
+                .iter(None, Some(Predicate::new(vec![expr.into()])))
+                .unwrap();
+            let read = iter
+                .flat_map(|batch| {
+                    batch
+                        .unwrap()
+                        .timestamps()
+                        .as_any()
+                        .downcast_ref::<TimestampMillisecondVector>()
+                        .unwrap()
+                        .iter_data()
+                        .collect::<Vec<_>>()
+                        .into_iter()
+                })
+                .map(|v| v.unwrap().0.value())
+                .collect::<Vec<_>>();
+            assert_eq!(timestamps, read);
+        }
+    }
 }
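The `test_memtable_filter` test above builds its `k1 = i` predicate from raw `Expr::BinaryExpr` parts. For reference, DataFusion's expression helpers express the same tree more compactly; a sketch, assuming `k1` names the tag column as in the test:

use datafusion_expr::{col, lit, Expr};

// col("k1").eq(lit(i)) desugars to the same
// Expr::BinaryExpr { left: Column("k1"), op: Operator::Eq, right: Literal }
// tree that the test constructs field by field.
fn k1_filter(i: u32) -> Expr {
    col("k1").eq(lit(i))
}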
@@ -957,6 +957,18 @@ impl DataParts {
         self.active.write_row(pk_index, kv)
     }
 
+    /// Returns the number of rows in the active buffer.
+    pub fn num_active_rows(&self) -> usize {
+        self.active.num_rows()
+    }
+
+    /// Freezes active buffer and creates a new active buffer.
+    pub fn freeze(&mut self) -> Result<()> {
+        let part = self.active.freeze(None, false)?;
+        self.frozen.push(part);
+        Ok(())
+    }
+
     /// Reads data from all parts including active and frozen parts.
     /// The returned iterator yields a record batch of one primary key at a time.
     /// The order of yielding primary keys is determined by provided weights.
@@ -976,6 +988,11 @@ impl DataParts {
     pub(crate) fn is_empty(&self) -> bool {
         self.active.is_empty() && self.frozen.iter().all(|part| part.is_empty())
     }
+
+    #[cfg(test)]
+    pub(crate) fn frozen_len(&self) -> usize {
+        self.frozen.len()
+    }
 }
 
 pub struct DataPartsReaderBuilder {
@@ -994,9 +1011,11 @@ impl DataPartsReaderBuilder {
         for p in self.parts {
             nodes.push(DataNode::new(DataSource::Part(p)));
         }
+        let num_parts = nodes.len();
         let merger = Merger::try_new(nodes)?;
         Ok(DataPartsReader {
             merger,
+            num_parts,
             elapsed: Default::default(),
         })
     }
@@ -1005,6 +1024,7 @@ impl DataPartsReaderBuilder {
 /// Reader for all parts inside a `DataParts`.
 pub struct DataPartsReader {
     merger: Merger<DataNode>,
+    num_parts: usize,
     elapsed: Duration,
 }
 
@@ -1032,6 +1052,10 @@ impl DataPartsReader {
     pub(crate) fn is_valid(&self) -> bool {
         self.merger.is_valid()
     }
+
+    pub(crate) fn num_parts(&self) -> usize {
+        self.num_parts
+    }
 }
 
 #[cfg(test)]
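The new `num_active_rows`/`freeze` pair above exposes `DataParts`' two-tier layout: writes land in a mutable active buffer, and freezing pushes its contents onto the immutable `frozen` list. A toy sketch of the pattern with plain vectors (all names here are illustrative, not the real types):

// Toy two-tier buffer: writes go to `active`; once it reaches the
// threshold, it is frozen wholesale onto the immutable `frozen` list.
struct Parts {
    active: Vec<i64>,
    frozen: Vec<Vec<i64>>,
}

impl Parts {
    fn write(&mut self, row: i64, threshold: usize) {
        // Freeze before the write once the buffer is full, as
        // Shard::write_with_pk_id does later in this diff.
        if self.active.len() >= threshold {
            self.frozen.push(std::mem::take(&mut self.active));
        }
        self.active.push(row);
    }
}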
@@ -45,7 +45,7 @@ impl<T: DataBatchSource> DataBatchSource for DedupReader<T> {
     }
 
     fn next(&mut self) -> Result<()> {
-        loop {
+        while self.inner.is_valid() {
             match &mut self.prev_batch_last_row {
                 None => {
                     // First shot, fill prev_batch_last_row and current_batch_range with first batch.
@@ -78,7 +78,7 @@ impl Partition {
 
         // Finds key in shards, now we ensure one key only exists in one shard.
         if let Some(pk_id) = inner.find_key_in_shards(primary_key) {
-            inner.write_to_shard(pk_id, &key_value);
+            inner.write_to_shard(pk_id, &key_value)?;
             inner.num_rows += 1;
             return Ok(());
         }
@@ -106,7 +106,7 @@ impl Partition {
     }
 
     /// Writes to the partition without a primary key.
-    pub fn write_no_key(&self, key_value: KeyValue) {
+    pub fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
         let mut inner = self.inner.write().unwrap();
         // If no primary key, always write to the first shard.
         debug_assert!(!inner.shards.is_empty());
@@ -117,12 +117,24 @@ impl Partition {
             shard_id: 0,
             pk_index: 0,
         };
-        inner.shards[0].write_with_pk_id(pk_id, &key_value);
+        inner.shards[0].write_with_pk_id(pk_id, &key_value)?;
         inner.num_rows += 1;
+
+        Ok(())
     }
 
     /// Scans data in the partition.
     pub fn read(&self, mut context: ReadPartitionContext) -> Result<PartitionReader> {
+        let start = Instant::now();
+        let key_filter = if context.need_prune_key {
+            Some(PrimaryKeyFilter::new(
+                context.metadata.clone(),
+                context.filters.clone(),
+                context.row_codec.clone(),
+            ))
+        } else {
+            None
+        };
         let (builder_source, shard_reader_builders) = {
             let inner = self.inner.read().unwrap();
             let mut shard_source = Vec::with_capacity(inner.shards.len() + 1);
@@ -141,14 +153,21 @@ impl Partition {
             (builder_reader, shard_source)
         };
 
+        context.metrics.num_shards += shard_reader_builders.len();
         let mut nodes = shard_reader_builders
             .into_iter()
-            .map(|builder| Ok(ShardNode::new(ShardSource::Shard(builder.build()?))))
+            .map(|builder| {
+                Ok(ShardNode::new(ShardSource::Shard(
+                    builder.build(key_filter.clone())?,
+                )))
+            })
             .collect::<Result<Vec<_>>>()?;
 
         if let Some(builder) = builder_source {
+            context.metrics.num_builder += 1;
             // Move the initialization of ShardBuilderReader out of read lock.
-            let shard_builder_reader = builder.build(Some(&context.pk_weights))?;
+            let shard_builder_reader =
+                builder.build(Some(&context.pk_weights), key_filter.clone())?;
             nodes.push(ShardNode::new(ShardSource::Builder(shard_builder_reader)));
         }
 
@@ -156,8 +175,10 @@ impl Partition {
         let merger = ShardMerger::try_new(nodes)?;
         if self.dedup {
             let source = DedupReader::try_new(merger)?;
+            context.metrics.build_partition_reader += start.elapsed();
             PartitionReader::new(context, Box::new(source))
         } else {
+            context.metrics.build_partition_reader += start.elapsed();
             PartitionReader::new(context, Box::new(merger))
         }
     }
@@ -266,11 +287,11 @@ pub(crate) struct PartitionStats {
 
 #[derive(Default)]
 struct PartitionReaderMetrics {
-    prune_pk: Duration,
+    build_partition_reader: Duration,
     read_source: Duration,
     data_batch_to_batch: Duration,
-    keys_before_pruning: usize,
-    keys_after_pruning: usize,
+    num_builder: usize,
+    num_shards: usize,
 }
 
 /// Reader to scan rows in a partition.
@@ -279,18 +300,11 @@ struct PartitionReaderMetrics {
 pub struct PartitionReader {
     context: ReadPartitionContext,
     source: BoxedDataBatchSource,
-    last_yield_pk_id: Option<PkId>,
 }
 
 impl PartitionReader {
     fn new(context: ReadPartitionContext, source: BoxedDataBatchSource) -> Result<Self> {
-        let mut reader = Self {
-            context,
-            source,
-            last_yield_pk_id: None,
-        };
-        // Find next valid batch.
-        reader.prune_batch_by_key()?;
+        let reader = Self { context, source };
 
         Ok(reader)
     }
@@ -305,8 +319,7 @@ impl PartitionReader {
     /// # Panics
     /// Panics if the reader is invalid.
     pub fn next(&mut self) -> Result<()> {
-        self.advance_source()?;
-        self.prune_batch_by_key()
+        self.advance_source()
     }
 
     /// Converts current data batch into a [Batch].
@@ -336,106 +349,77 @@ impl PartitionReader {
         self.context.metrics.read_source += read_source.elapsed();
         Ok(())
     }
-
-    fn prune_batch_by_key(&mut self) -> Result<()> {
-        if self.context.metadata.primary_key.is_empty() || !self.context.need_prune_key {
-            // Nothing to prune.
-            return Ok(());
-        }
-
-        while self.source.is_valid() {
-            let pk_id = self.source.current_pk_id();
-            if let Some(yield_pk_id) = self.last_yield_pk_id {
-                if pk_id == yield_pk_id {
-                    // If this batch has the same key as last returned batch.
-                    // We can return it without evaluating filters.
-                    break;
-                }
-            }
-            let key = self.source.current_key().unwrap();
-            self.context.metrics.keys_before_pruning += 1;
-            // Prune batch by primary key.
-            if prune_primary_key(
-                &self.context.metadata,
-                &self.context.filters,
-                &self.context.row_codec,
-                key,
-                &mut self.context.metrics,
-            ) {
-                // We need this key.
-                self.last_yield_pk_id = Some(pk_id);
-                self.context.metrics.keys_after_pruning += 1;
-                break;
-            }
-            self.advance_source()?;
-        }
-        Ok(())
-    }
 }
 
-fn prune_primary_key(
-    metadata: &RegionMetadataRef,
-    filters: &[SimpleFilterEvaluator],
-    codec: &McmpRowCodec,
-    pk: &[u8],
-    metrics: &mut PartitionReaderMetrics,
-) -> bool {
-    let start = Instant::now();
-    let res = prune_primary_key_inner(metadata, filters, codec, pk);
-    metrics.prune_pk += start.elapsed();
-    res
-}
-
-// TODO(yingwen): Improve performance of key pruning. Now we need to find index and
-// then decode and convert each value.
-/// Returns true if the `pk` is still needed.
-fn prune_primary_key_inner(
-    metadata: &RegionMetadataRef,
-    filters: &[SimpleFilterEvaluator],
-    codec: &McmpRowCodec,
-    pk: &[u8],
-) -> bool {
-    if filters.is_empty() {
-        return true;
-    }
-
-    // no primary key, we simply return true.
-    if metadata.primary_key.is_empty() {
-        return true;
-    }
-
-    let pk_values = match codec.decode(pk) {
-        Ok(values) => values,
-        Err(e) => {
-            common_telemetry::error!(e; "Failed to decode primary key");
-            return true;
-        }
-    };
-
-    // evaluate filters against primary key values
-    let mut result = true;
-    for filter in filters {
-        if Partition::is_partition_column(filter.column_name()) {
-            continue;
-        }
-        let Some(column) = metadata.column_by_name(filter.column_name()) else {
-            continue;
-        };
-        // ignore filters that are not referencing primary key columns
-        if column.semantic_type != SemanticType::Tag {
-            continue;
-        }
-        // index of the column in primary keys.
-        // Safety: A tag column is always in primary key.
-        let index = metadata.primary_key_index(column.column_id).unwrap();
-        // Safety: arrow schema and datatypes are constructed from the same source.
-        let scalar_value = pk_values[index]
-            .try_to_scalar_value(&column.column_schema.data_type)
-            .unwrap();
-        result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
-    }
-
-    result
-}
+#[derive(Clone)]
+pub(crate) struct PrimaryKeyFilter {
+    metadata: RegionMetadataRef,
+    filters: Arc<Vec<SimpleFilterEvaluator>>,
+    codec: Arc<McmpRowCodec>,
+    offsets_buf: Vec<usize>,
+}
+
+impl PrimaryKeyFilter {
+    pub(crate) fn new(
+        metadata: RegionMetadataRef,
+        filters: Arc<Vec<SimpleFilterEvaluator>>,
+        codec: Arc<McmpRowCodec>,
+    ) -> Self {
+        Self {
+            metadata,
+            filters,
+            codec,
+            offsets_buf: Vec::new(),
+        }
+    }
+
+    pub(crate) fn prune_primary_key(&mut self, pk: &[u8]) -> bool {
+        if self.filters.is_empty() {
+            return true;
+        }
+
+        // no primary key, we simply return true.
+        if self.metadata.primary_key.is_empty() {
+            return true;
+        }
+
+        // evaluate filters against primary key values
+        let mut result = true;
+        self.offsets_buf.clear();
+        for filter in &*self.filters {
+            if Partition::is_partition_column(filter.column_name()) {
+                continue;
+            }
+            let Some(column) = self.metadata.column_by_name(filter.column_name()) else {
+                continue;
+            };
+            // ignore filters that are not referencing primary key columns
+            if column.semantic_type != SemanticType::Tag {
+                continue;
+            }
+            // index of the column in primary keys.
+            // Safety: A tag column is always in primary key.
+            let index = self.metadata.primary_key_index(column.column_id).unwrap();
+            let value = match self.codec.decode_value_at(pk, index, &mut self.offsets_buf) {
+                Ok(v) => v,
+                Err(e) => {
+                    common_telemetry::error!(e; "Failed to decode primary key");
+                    return true;
+                }
+            };
+
+            // TODO(yingwen): `evaluate_scalar()` creates temporary arrays to compare scalars. We
+            // can compare the bytes directly without allocation and matching types as we use
+            // comparable encoding.
+            // Safety: arrow schema and datatypes are constructed from the same source.
+            let scalar_value = value
+                .try_to_scalar_value(&column.column_schema.data_type)
+                .unwrap();
+            result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
+        }
+
+        result
+    }
+}
 
 /// Structs to reuse across readers to avoid allocating for each reader.
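Two allocation-related moves are visible in the `PrimaryKeyFilter` rewrite above: the filter decodes only the referenced tag via `decode_value_at` with a reusable `offsets_buf`, instead of decoding the whole composite key; and the evaluator list becomes `Arc<Vec<...>>`, so handing a clone of the filter to every shard reader is a pointer bump rather than a deep copy. A sketch of the sharing side, with stand-in types (none of these names are the real ones):

use std::sync::Arc;

#[derive(Clone)]
struct KeyFilter {
    filters: Arc<Vec<String>>, // stand-in for Arc<Vec<SimpleFilterEvaluator>>
    offsets_buf: Vec<usize>,   // per-clone scratch buffer, reused across keys
}

fn main() {
    let template = KeyFilter {
        filters: Arc::new(vec!["hostname = 'host_0'".to_string()]),
        offsets_buf: Vec::new(),
    };
    // One cheap clone per shard reader, as Partition::read does above.
    let per_shard: Vec<KeyFilter> = (0..4).map(|_| template.clone()).collect();
    assert_eq!(4, per_shard.len());
}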
@@ -443,7 +427,7 @@ pub(crate) struct ReadPartitionContext {
     metadata: RegionMetadataRef,
     row_codec: Arc<McmpRowCodec>,
     projection: HashSet<ColumnId>,
-    filters: Vec<SimpleFilterEvaluator>,
+    filters: Arc<Vec<SimpleFilterEvaluator>>,
     /// Buffer to store pk weights.
     pk_weights: Vec<u16>,
     need_prune_key: bool,
@@ -452,10 +436,6 @@ pub(crate) struct ReadPartitionContext {
 
 impl Drop for ReadPartitionContext {
     fn drop(&mut self) {
-        let partition_prune_pk = self.metrics.prune_pk.as_secs_f64();
-        MERGE_TREE_READ_STAGE_ELAPSED
-            .with_label_values(&["partition_prune_pk"])
-            .observe(partition_prune_pk);
         let partition_read_source = self.metrics.read_source.as_secs_f64();
         MERGE_TREE_READ_STAGE_ELAPSED
             .with_label_values(&["partition_read_source"])
@@ -465,16 +445,19 @@ impl Drop for ReadPartitionContext {
             .with_label_values(&["partition_data_batch_to_batch"])
             .observe(partition_data_batch_to_batch);
 
-        if self.metrics.keys_before_pruning != 0 {
-            common_telemetry::debug!(
-                "TreeIter pruning, before: {}, after: {}, partition_read_source: {}s, partition_prune_pk: {}s, partition_data_batch_to_batch: {}s",
-                self.metrics.keys_before_pruning,
-                self.metrics.keys_after_pruning,
-                partition_read_source,
-                partition_prune_pk,
-                partition_data_batch_to_batch,
-            );
-        }
+        common_telemetry::debug!(
+            "TreeIter partitions metrics, \
+            num_builder: {}, \
+            num_shards: {}, \
+            build_partition_reader: {}s, \
+            partition_read_source: {}s, \
+            partition_data_batch_to_batch: {}s",
+            self.metrics.num_builder,
+            self.metrics.num_shards,
+            self.metrics.build_partition_reader.as_secs_f64(),
+            partition_read_source,
+            partition_data_batch_to_batch,
+        );
     }
 }
 
@@ -490,7 +473,7 @@ impl ReadPartitionContext {
             metadata,
             row_codec,
             projection,
-            filters,
+            filters: Arc::new(filters),
             pk_weights: Vec::new(),
             need_prune_key,
             metrics: Default::default(),
@@ -578,7 +561,16 @@ impl Inner {
     fn new(metadata: RegionMetadataRef, config: &MergeTreeConfig) -> Self {
         let (shards, current_shard_id) = if metadata.primary_key.is_empty() {
             let data_parts = DataParts::new(metadata.clone(), DATA_INIT_CAP, config.dedup);
-            (vec![Shard::new(0, None, data_parts, config.dedup)], 1)
+            (
+                vec![Shard::new(
+                    0,
+                    None,
+                    data_parts,
+                    config.dedup,
+                    config.data_freeze_threshold,
+                )],
+                1,
+            )
         } else {
             (Vec::new(), 0)
         };
@@ -598,18 +590,22 @@ impl Inner {
         self.pk_to_pk_id.get(primary_key).copied()
     }
 
-    fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) {
+    fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
         if pk_id.shard_id == self.shard_builder.current_shard_id() {
             self.shard_builder.write_with_pk_id(pk_id, key_value);
-            return;
+            return Ok(());
         }
-        for shard in &mut self.shards {
-            if shard.shard_id == pk_id.shard_id {
-                shard.write_with_pk_id(pk_id, key_value);
-                self.num_rows += 1;
-                return;
-            }
-        }
+
+        // Safety: We find the shard by shard id.
+        let shard = self
+            .shards
+            .iter_mut()
+            .find(|shard| shard.shard_id == pk_id.shard_id)
+            .unwrap();
+        shard.write_with_pk_id(pk_id, key_value)?;
+        self.num_rows += 1;
+
+        Ok(())
     }
 
     fn freeze_active_shard(&mut self) -> Result<()> {
@@ -15,6 +15,7 @@
 //! Shard in a partition.
 
 use std::cmp::Ordering;
+use std::time::{Duration, Instant};
 
 use store_api::metadata::RegionMetadataRef;
 
@@ -25,8 +26,10 @@ use crate::memtable::merge_tree::data::{
 };
 use crate::memtable::merge_tree::dict::KeyDictRef;
 use crate::memtable::merge_tree::merger::{Merger, Node};
+use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
 use crate::memtable::merge_tree::shard_builder::ShardBuilderReader;
-use crate::memtable::merge_tree::{PkId, ShardId};
+use crate::memtable::merge_tree::{PkId, PkIndex, ShardId};
+use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
 
 /// Shard stores data related to the same key dictionary.
 pub struct Shard {
@@ -36,6 +39,8 @@ pub struct Shard {
     /// Data in the shard.
     data_parts: DataParts,
     dedup: bool,
+    /// Number of rows to freeze a data part.
+    data_freeze_threshold: usize,
 }
 
 impl Shard {
@@ -45,20 +50,29 @@ impl Shard {
         key_dict: Option<KeyDictRef>,
         data_parts: DataParts,
         dedup: bool,
+        data_freeze_threshold: usize,
     ) -> Shard {
         Shard {
             shard_id,
             key_dict,
             data_parts,
             dedup,
+            data_freeze_threshold,
         }
     }
 
     /// Writes a key value into the shard.
-    pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) {
+    ///
+    /// It freezes the active buffer if it is full.
+    pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
         debug_assert_eq!(self.shard_id, pk_id.shard_id);
 
+        if self.data_parts.num_active_rows() >= self.data_freeze_threshold {
+            self.data_parts.freeze()?;
+        }
+
         self.data_parts.write_row(pk_id.pk_index, key_value);
+        Ok(())
     }
 
     /// Scans the shard.
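Since the freeze check above runs before each write, a threshold of T freezes the buffer just before writes T+1, 2T+1, and so on, leaving floor((n-1)/T) frozen parts after n writes. With T = 50 and n = 200, as in the `test_shard_freeze` test later in this diff, that is 3 frozen parts of 50 rows plus 50 rows still active, matching the test's assertion. A one-line check of the arithmetic:

// floor((n - 1) / t) frozen parts after n writes with threshold t.
fn frozen_parts(n: usize, t: usize) -> usize {
    n.saturating_sub(1) / t
}

fn main() {
    assert_eq!(3, frozen_parts(200, 50)); // freezes before writes 51, 101, 151
    assert_eq!(0, frozen_parts(50, 50)); // a merely full buffer never freezes
}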
@@ -80,6 +94,7 @@ impl Shard {
             key_dict: self.key_dict.clone(),
             data_parts: DataParts::new(metadata, DATA_INIT_CAP, self.dedup),
             dedup: self.dedup,
+            data_freeze_threshold: self.data_freeze_threshold,
         }
     }
 
@@ -131,18 +146,15 @@ pub struct ShardReaderBuilder {
 }
 
 impl ShardReaderBuilder {
-    pub(crate) fn build(self) -> Result<ShardReader> {
+    pub(crate) fn build(self, key_filter: Option<PrimaryKeyFilter>) -> Result<ShardReader> {
         let ShardReaderBuilder {
             shard_id,
             key_dict,
             inner,
         } = self;
+        let now = Instant::now();
         let parts_reader = inner.build()?;
-        Ok(ShardReader {
-            shard_id,
-            key_dict,
-            parts_reader,
-        })
+        ShardReader::new(shard_id, key_dict, parts_reader, key_filter, now.elapsed())
     }
 }
 
@@ -151,15 +163,46 @@ pub struct ShardReader {
     shard_id: ShardId,
     key_dict: Option<KeyDictRef>,
     parts_reader: DataPartsReader,
+    key_filter: Option<PrimaryKeyFilter>,
+    last_yield_pk_index: Option<PkIndex>,
+    keys_before_pruning: usize,
+    keys_after_pruning: usize,
+    prune_pk_cost: Duration,
+    data_build_cost: Duration,
 }
 
 impl ShardReader {
+    fn new(
+        shard_id: ShardId,
+        key_dict: Option<KeyDictRef>,
+        parts_reader: DataPartsReader,
+        key_filter: Option<PrimaryKeyFilter>,
+        data_build_cost: Duration,
+    ) -> Result<Self> {
+        let has_pk = key_dict.is_some();
+        let mut reader = Self {
+            shard_id,
+            key_dict,
+            parts_reader,
+            key_filter: if has_pk { key_filter } else { None },
+            last_yield_pk_index: None,
+            keys_before_pruning: 0,
+            keys_after_pruning: 0,
+            prune_pk_cost: Duration::default(),
+            data_build_cost,
+        };
+        reader.prune_batch_by_key()?;
+
+        Ok(reader)
+    }
+
     fn is_valid(&self) -> bool {
         self.parts_reader.is_valid()
     }
 
     fn next(&mut self) -> Result<()> {
-        self.parts_reader.next()
+        self.parts_reader.next()?;
+        self.prune_batch_by_key()
     }
 
     fn current_key(&self) -> Option<&[u8]> {
@@ -180,6 +223,54 @@ impl ShardReader {
     fn current_data_batch(&self) -> DataBatch {
         self.parts_reader.current_data_batch()
     }
+
+    fn prune_batch_by_key(&mut self) -> Result<()> {
+        let Some(key_filter) = &mut self.key_filter else {
+            return Ok(());
+        };
+
+        while self.parts_reader.is_valid() {
+            let pk_index = self.parts_reader.current_data_batch().pk_index();
+            if let Some(yield_pk_index) = self.last_yield_pk_index {
+                if pk_index == yield_pk_index {
+                    break;
+                }
+            }
+            self.keys_before_pruning += 1;
+            // Safety: `key_filter` is some so the shard has primary keys.
+            let key = self.key_dict.as_ref().unwrap().key_by_pk_index(pk_index);
+            let now = Instant::now();
+            if key_filter.prune_primary_key(key) {
+                self.prune_pk_cost += now.elapsed();
+                self.last_yield_pk_index = Some(pk_index);
+                self.keys_after_pruning += 1;
+                break;
+            }
+            self.prune_pk_cost += now.elapsed();
+            self.parts_reader.next()?;
+        }
+
+        Ok(())
+    }
+}
+
+impl Drop for ShardReader {
+    fn drop(&mut self) {
+        let shard_prune_pk = self.prune_pk_cost.as_secs_f64();
+        MERGE_TREE_READ_STAGE_ELAPSED
+            .with_label_values(&["shard_prune_pk"])
+            .observe(shard_prune_pk);
+        if self.keys_before_pruning > 0 {
+            common_telemetry::debug!(
+                "ShardReader metrics, data parts: {}, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
+                self.parts_reader.num_parts(),
+                self.keys_before_pruning,
+                self.keys_after_pruning,
+                shard_prune_pk,
+                self.data_build_cost.as_secs_f64(),
+            );
+        }
+    }
 }
 
 /// A merger that merges batches from multiple shards.
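`prune_batch_by_key` above (and its twin in `ShardBuilderReader` below) memoizes `last_yield_pk_index`: batches arrive grouped by primary key, so once a key passes the filter, following batches with the same `pk_index` are yielded without re-running the predicate. A stripped-down sketch of the idea over a key-grouped stream (illustrative types; unlike the real readers, a rejected key is simply re-tested here instead of being skipped past):

// Last-match memoization: the expensive predicate runs once per distinct
// accepted key rather than once per batch.
fn filter_grouped<'a>(
    batches: &'a [(u16, &'a str)],          // (pk_index, payload), grouped by key
    mut pred: impl FnMut(u16) -> bool + 'a, // expensive primary-key predicate
) -> impl Iterator<Item = &'a str> + 'a {
    let mut last_yield: Option<u16> = None;
    batches.iter().filter_map(move |(pk, payload)| {
        if last_yield == Some(*pk) || pred(*pk) {
            last_yield = Some(*pk);
            Some(*payload)
        } else {
            None
        }
    })
}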
@@ -388,6 +479,7 @@ mod tests {
         shard_id: ShardId,
         metadata: RegionMetadataRef,
         input: &[(KeyValues, PkIndex)],
+        data_freeze_threshold: usize,
     ) -> Shard {
         let mut dict_builder = KeyDictBuilder::new(1024);
         let mut metrics = WriteMetrics::default();
@@ -402,27 +494,17 @@ mod tests {
         let dict = dict_builder.finish(&mut BTreeMap::new()).unwrap();
         let data_parts = DataParts::new(metadata, DATA_INIT_CAP, true);
 
-        Shard::new(shard_id, Some(Arc::new(dict)), data_parts, true)
+        Shard::new(
+            shard_id,
+            Some(Arc::new(dict)),
+            data_parts,
+            true,
+            data_freeze_threshold,
+        )
     }
 
-    #[test]
-    fn test_write_read_shard() {
-        let metadata = metadata_for_test();
-        let input = input_with_key(&metadata);
-        let mut shard = new_shard_with_dict(8, metadata, &input);
-        assert!(shard.is_empty());
-        for (key_values, pk_index) in &input {
-            for kv in key_values.iter() {
-                let pk_id = PkId {
-                    shard_id: shard.shard_id,
-                    pk_index: *pk_index,
-                };
-                shard.write_with_pk_id(pk_id, &kv);
-            }
-        }
-        assert!(!shard.is_empty());
-
-        let mut reader = shard.read().unwrap().build().unwrap();
+    fn collect_timestamps(shard: &Shard) -> Vec<i64> {
+        let mut reader = shard.read().unwrap().build(None).unwrap();
         let mut timestamps = Vec::new();
         while reader.is_valid() {
             let rb = reader.current_data_batch().slice_record_batch();
@@ -432,6 +514,64 @@ mod tests {
 
             reader.next().unwrap();
         }
+        timestamps
+    }
+
+    #[test]
+    fn test_write_read_shard() {
+        let metadata = metadata_for_test();
+        let input = input_with_key(&metadata);
+        let mut shard = new_shard_with_dict(8, metadata, &input, 100);
+        assert!(shard.is_empty());
+        for (key_values, pk_index) in &input {
+            for kv in key_values.iter() {
+                let pk_id = PkId {
+                    shard_id: shard.shard_id,
+                    pk_index: *pk_index,
+                };
+                shard.write_with_pk_id(pk_id, &kv).unwrap();
+            }
+        }
+        assert!(!shard.is_empty());
+
+        let timestamps = collect_timestamps(&shard);
         assert_eq!(vec![0, 1, 10, 11, 20, 21], timestamps);
     }
+
+    #[test]
+    fn test_shard_freeze() {
+        let metadata = metadata_for_test();
+        let kvs = build_key_values_with_ts_seq_values(
+            &metadata,
+            "shard".to_string(),
+            0,
+            [0].into_iter(),
+            [Some(0.0)].into_iter(),
+            0,
+        );
+        let mut shard = new_shard_with_dict(8, metadata.clone(), &[(kvs, 0)], 50);
+        let expected: Vec<_> = (0..200).collect();
+        for i in &expected {
+            let kvs = build_key_values_with_ts_seq_values(
+                &metadata,
+                "shard".to_string(),
+                0,
+                [*i].into_iter(),
+                [Some(0.0)].into_iter(),
+                *i as u64,
+            );
+            let pk_id = PkId {
+                shard_id: shard.shard_id,
+                pk_index: *i as PkIndex,
+            };
+            for kv in kvs.iter() {
+                shard.write_with_pk_id(pk_id, &kv).unwrap();
+            }
+        }
+        assert!(!shard.is_empty());
+        assert_eq!(3, shard.data_parts.frozen_len());
+
+        let timestamps = collect_timestamps(&shard);
+        assert_eq!(expected, timestamps);
+    }
 }
@@ -16,6 +16,7 @@
 
 use std::collections::{BTreeMap, HashMap};
 use std::sync::Arc;
+use std::time::{Duration, Instant};
 
 use store_api::metadata::RegionMetadataRef;
 
@@ -26,8 +27,9 @@ use crate::memtable::merge_tree::data::{
 };
 use crate::memtable::merge_tree::dict::{DictBuilderReader, KeyDictBuilder};
 use crate::memtable::merge_tree::metrics::WriteMetrics;
+use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
 use crate::memtable::merge_tree::shard::Shard;
-use crate::memtable::merge_tree::{MergeTreeConfig, PkId, ShardId};
+use crate::memtable::merge_tree::{MergeTreeConfig, PkId, PkIndex, ShardId};
 use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
 
 /// Builder to write keys and data to a shard that the key dictionary
@@ -136,7 +138,13 @@ impl ShardBuilder {
         let shard_id = self.current_shard_id;
         self.current_shard_id += 1;
 
-        Ok(Some(Shard::new(shard_id, key_dict, data_parts, self.dedup)))
+        Ok(Some(Shard::new(
+            shard_id,
+            key_dict,
+            data_parts,
+            self.dedup,
+            self.data_freeze_threshold,
+        )))
     }
 
     /// Scans the shard builder.
@@ -176,13 +184,20 @@ pub(crate) struct ShardBuilderReaderBuilder {
 }
 
 impl ShardBuilderReaderBuilder {
-    pub(crate) fn build(self, pk_weights: Option<&[u16]>) -> Result<ShardBuilderReader> {
+    pub(crate) fn build(
+        self,
+        pk_weights: Option<&[u16]>,
+        key_filter: Option<PrimaryKeyFilter>,
+    ) -> Result<ShardBuilderReader> {
+        let now = Instant::now();
         let data_reader = self.data_reader.build(pk_weights)?;
-        Ok(ShardBuilderReader {
-            shard_id: self.shard_id,
-            dict_reader: self.dict_reader,
-            data_reader,
-        })
+        ShardBuilderReader::new(
+            self.shard_id,
+            self.dict_reader,
+            data_reader,
+            key_filter,
+            now.elapsed(),
+        )
     }
 }
 
@@ -191,15 +206,45 @@ pub struct ShardBuilderReader {
     shard_id: ShardId,
     dict_reader: DictBuilderReader,
     data_reader: DataBufferReader,
+    key_filter: Option<PrimaryKeyFilter>,
+    last_yield_pk_index: Option<PkIndex>,
+    keys_before_pruning: usize,
+    keys_after_pruning: usize,
+    prune_pk_cost: Duration,
+    data_build_cost: Duration,
 }
 
 impl ShardBuilderReader {
+    fn new(
+        shard_id: ShardId,
+        dict_reader: DictBuilderReader,
+        data_reader: DataBufferReader,
+        key_filter: Option<PrimaryKeyFilter>,
+        data_build_cost: Duration,
+    ) -> Result<Self> {
+        let mut reader = ShardBuilderReader {
+            shard_id,
+            dict_reader,
+            data_reader,
+            key_filter,
+            last_yield_pk_index: None,
+            keys_before_pruning: 0,
+            keys_after_pruning: 0,
+            prune_pk_cost: Duration::default(),
+            data_build_cost,
+        };
+        reader.prune_batch_by_key()?;
+
+        Ok(reader)
+    }
+
     pub fn is_valid(&self) -> bool {
         self.data_reader.is_valid()
     }
 
     pub fn next(&mut self) -> Result<()> {
-        self.data_reader.next()
+        self.data_reader.next()?;
+        self.prune_batch_by_key()
    }
 
     pub fn current_key(&self) -> Option<&[u8]> {
@@ -218,6 +263,52 @@ impl ShardBuilderReader {
     pub fn current_data_batch(&self) -> DataBatch {
         self.data_reader.current_data_batch()
     }
+
+    fn prune_batch_by_key(&mut self) -> Result<()> {
+        let Some(key_filter) = &mut self.key_filter else {
+            return Ok(());
+        };
+
+        while self.data_reader.is_valid() {
+            let pk_index = self.data_reader.current_data_batch().pk_index();
+            if let Some(yield_pk_index) = self.last_yield_pk_index {
+                if pk_index == yield_pk_index {
+                    break;
+                }
+            }
+            self.keys_before_pruning += 1;
+            let key = self.dict_reader.key_by_pk_index(pk_index);
+            let now = Instant::now();
+            if key_filter.prune_primary_key(key) {
+                self.prune_pk_cost += now.elapsed();
+                self.last_yield_pk_index = Some(pk_index);
+                self.keys_after_pruning += 1;
+                break;
+            }
+            self.prune_pk_cost += now.elapsed();
+            self.data_reader.next()?;
+        }
+
+        Ok(())
+    }
+}
+
+impl Drop for ShardBuilderReader {
+    fn drop(&mut self) {
+        let shard_builder_prune_pk = self.prune_pk_cost.as_secs_f64();
+        MERGE_TREE_READ_STAGE_ELAPSED
+            .with_label_values(&["shard_builder_prune_pk"])
+            .observe(shard_builder_prune_pk);
+        if self.keys_before_pruning > 0 {
+            common_telemetry::debug!(
+                "ShardBuilderReader metrics, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
+                self.keys_before_pruning,
+                self.keys_after_pruning,
+                shard_builder_prune_pk,
+                self.data_build_cost.as_secs_f64(),
+            );
+        }
+    }
 }
 
 #[cfg(test)]
@@ -306,7 +397,7 @@ mod tests {
         let mut reader = shard_builder
             .read(&mut pk_weights)
             .unwrap()
-            .build(Some(&pk_weights))
+            .build(Some(&pk_weights), None)
             .unwrap();
         let mut timestamps = Vec::new();
         while reader.is_valid() {
@@ -124,7 +124,7 @@ impl MergeTree {
 
         if !has_pk {
             // No primary key.
-            self.write_no_key(kv);
+            self.write_no_key(kv)?;
             continue;
         }
 
@@ -299,7 +299,7 @@ impl MergeTree {
         )
     }
 
-    fn write_no_key(&self, key_value: KeyValue) {
+    fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
         let partition_key = Partition::get_partition_key(&key_value, self.is_partitioned);
         let partition = self.get_or_create_partition(partition_key);
 
@@ -171,6 +171,8 @@ impl RegionOpener {
         // Initial memtable id is 0.
         let mutable = self.memtable_builder.build(0, &metadata);
 
+        debug!("Create region {} with options: {:?}", region_id, options);
+
         let version = VersionBuilder::new(metadata, mutable)
             .options(options)
             .build();
@@ -249,6 +251,9 @@ impl RegionOpener {
 
         let region_id = self.region_id;
         let object_store = self.object_store(&region_options.storage)?.clone();
+
+        debug!("Open region {} with options: {:?}", region_id, self.options);
+
         let access_layer = Arc::new(AccessLayer::new(
             self.region_dir.clone(),
             object_store,
@@ -13,6 +13,8 @@
|
|||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
//! Options for a region.
|
//! Options for a region.
|
||||||
|
//!
|
||||||
|
//! If we add options in this mod, we also need to modify [store_api::mito_engine_options].
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
@@ -358,6 +360,7 @@ mod tests {
|
|||||||
("compaction.type", "twcs"),
|
("compaction.type", "twcs"),
|
||||||
("storage", "S3"),
|
("storage", "S3"),
|
||||||
("index.inverted_index.ignore_column_ids", "1,2,3"),
|
("index.inverted_index.ignore_column_ids", "1,2,3"),
|
||||||
|
("index.inverted_index.segment_row_count", "512"),
|
||||||
(
|
(
|
||||||
WAL_OPTIONS_KEY,
|
WAL_OPTIONS_KEY,
|
||||||
&serde_json::to_string(&wal_options).unwrap(),
|
&serde_json::to_string(&wal_options).unwrap(),
|
||||||
@@ -376,7 +379,7 @@ mod tests {
|
|||||||
index_options: IndexOptions {
|
index_options: IndexOptions {
|
||||||
inverted_index: InvertedIndexOptions {
|
inverted_index: InvertedIndexOptions {
|
||||||
ignore_column_ids: vec![1, 2, 3],
|
ignore_column_ids: vec![1, 2, 3],
|
||||||
segment_row_count: 1024,
|
segment_row_count: 512,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -215,6 +215,61 @@ impl SortField {
             Decimal128, Decimal128
         )
     }
+
+    /// Skip deserializing this field, returns the length of it.
+    fn skip_deserialize(
+        &self,
+        bytes: &[u8],
+        deserializer: &mut Deserializer<&[u8]>,
+    ) -> Result<usize> {
+        let pos = deserializer.position();
+        if bytes[pos] == 0 {
+            deserializer.advance(1);
+            return Ok(1);
+        }
+
+        let to_skip = match &self.data_type {
+            ConcreteDataType::Boolean(_) => 2,
+            ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
+            ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
+            ConcreteDataType::Int32(_) | ConcreteDataType::UInt32(_) => 5,
+            ConcreteDataType::Int64(_) | ConcreteDataType::UInt64(_) => 9,
+            ConcreteDataType::Float32(_) => 5,
+            ConcreteDataType::Float64(_) => 9,
+            ConcreteDataType::Binary(_) => {
+                // Now the encoder encode binary as a list of bytes so we can't use
+                // skip bytes.
+                let pos_before = deserializer.position();
+                let mut current = pos_before + 1;
+                while bytes[current] == 1 {
+                    current += 2;
+                }
+                let to_skip = current - pos_before + 1;
+                deserializer.advance(to_skip);
+                return Ok(to_skip);
+            }
+            ConcreteDataType::String(_) => {
+                let pos_before = deserializer.position();
+                deserializer.advance(1);
+                deserializer
+                    .skip_bytes()
+                    .context(error::DeserializeFieldSnafu)?;
+                return Ok(deserializer.position() - pos_before);
+            }
+            ConcreteDataType::Date(_) => 5,
+            ConcreteDataType::DateTime(_) => 9,
+            ConcreteDataType::Timestamp(_) => 9, // We treat timestamp as Option<i64>
+            ConcreteDataType::Time(_) => 10,     // i64 and 1 byte time unit
+            ConcreteDataType::Duration(_) => 10,
+            ConcreteDataType::Interval(_) => 18,
+            ConcreteDataType::Decimal128(_) => 19,
+            ConcreteDataType::Null(_)
+            | ConcreteDataType::List(_)
+            | ConcreteDataType::Dictionary(_) => 0,
+        };
+        deserializer.advance(to_skip);
+        Ok(to_skip)
+    }
 }
 
 /// A memory-comparable row [Value] encoder/decoder.
@@ -236,6 +291,52 @@ impl McmpRowCodec {
     pub fn estimated_size(&self) -> usize {
         self.fields.iter().map(|f| f.estimated_size()).sum()
     }
+
+    /// Decode value at `pos` in `bytes`.
+    ///
+    /// The i-th element in offsets buffer is how many bytes to skip in order to read value at `pos`.
+    pub fn decode_value_at(
+        &self,
+        bytes: &[u8],
+        pos: usize,
+        offsets_buf: &mut Vec<usize>,
+    ) -> Result<Value> {
+        let mut deserializer = Deserializer::new(bytes);
+        if pos < offsets_buf.len() {
+            // We computed the offset before.
+            let to_skip = offsets_buf[pos];
+            deserializer.advance(to_skip);
+            return self.fields[pos].deserialize(&mut deserializer);
+        }
+
+        if offsets_buf.is_empty() {
+            let mut offset = 0;
+            // Skip values before `pos`.
+            for i in 0..pos {
+                // Offset to skip before reading value i.
+                offsets_buf.push(offset);
+                let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
+                offset += skip;
+            }
+            // Offset to skip before reading this value.
+            offsets_buf.push(offset);
+        } else {
+            // Offsets are not enough.
+            let value_start = offsets_buf.len() - 1;
+            // Advances to decode value at `value_start`.
+            let mut offset = offsets_buf[value_start];
+            deserializer.advance(offset);
+            for i in value_start..pos {
+                // Skip value i.
+                let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
+                // Offset for the value at i + 1.
+                offset += skip;
+                offsets_buf.push(offset);
+            }
+        }
+
+        self.fields[pos].deserialize(&mut deserializer)
+    }
 }
 
 impl RowCodec for McmpRowCodec {
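`decode_value_at` above amortizes repeated point reads of one encoded row by caching per-field byte offsets, so each prefix field is scanned at most once. A self-contained sketch of the same idea over a toy length-prefixed encoding (nothing here is the real `McmpRowCodec` format):

/// Toy row format: each field is one length byte followed by that many
/// payload bytes. `offsets[i]` caches how many bytes precede field `i`.
fn decode_field_at<'a>(bytes: &'a [u8], pos: usize, offsets: &mut Vec<usize>) -> &'a [u8] {
    if offsets.is_empty() {
        offsets.push(0);
    }
    // Cursor sits at the start of the last field whose offset is cached.
    let mut cursor = *offsets.last().unwrap();
    while offsets.len() <= pos {
        let len = bytes[cursor] as usize;
        cursor += 1 + len; // skip length byte + payload
        offsets.push(cursor);
    }
    let start = offsets[pos];
    let len = bytes[start] as usize;
    &bytes[start + 1..start + 1 + len]
}

fn main() {
    let row = [2, b'h', b'i', 3, b'f', b'o', b'o', 1, b'!'];
    let mut offsets = Vec::new();
    assert_eq!(decode_field_at(&row, 2, &mut offsets), b"!");
    // The second lookup reuses the cached offsets instead of rescanning.
    assert_eq!(decode_field_at(&row, 1, &mut offsets), b"foo");
}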
@@ -274,7 +375,7 @@ impl RowCodec for McmpRowCodec {
 #[cfg(test)]
 mod tests {
     use common_base::bytes::StringBytes;
-    use common_time::Timestamp;
+    use common_time::{DateTime, Timestamp};
     use datatypes::value::Value;
 
     use super::*;

@@ -292,6 +393,18 @@ mod tests {
         let result = encoder.encode(value_ref.iter().cloned()).unwrap();
         let decoded = encoder.decode(&result).unwrap();
         assert_eq!(decoded, row);
+        let mut decoded = Vec::new();
+        let mut offsets = Vec::new();
+        // Iter two times to test offsets buffer.
+        for _ in 0..2 {
+            decoded.clear();
+            for i in 0..data_types.len() {
+                let value = encoder.decode_value_at(&result, i, &mut offsets).unwrap();
+                decoded.push(value);
+            }
+            assert_eq!(data_types.len(), offsets.len(), "offsets: {:?}", offsets);
+            assert_eq!(decoded, row);
+        }
     }
 
     #[test]

@@ -416,5 +529,53 @@ mod tests {
             ],
             vec![Value::Null, Value::Int64(43), Value::Boolean(true)],
         );
+
+        // All types.
+        check_encode_and_decode(
+            &[
+                ConcreteDataType::boolean_datatype(),
+                ConcreteDataType::int8_datatype(),
+                ConcreteDataType::uint8_datatype(),
+                ConcreteDataType::int16_datatype(),
+                ConcreteDataType::uint16_datatype(),
+                ConcreteDataType::int32_datatype(),
+                ConcreteDataType::uint32_datatype(),
+                ConcreteDataType::int64_datatype(),
+                ConcreteDataType::uint64_datatype(),
+                ConcreteDataType::float32_datatype(),
+                ConcreteDataType::float64_datatype(),
+                ConcreteDataType::binary_datatype(),
+                ConcreteDataType::string_datatype(),
+                ConcreteDataType::date_datatype(),
+                ConcreteDataType::datetime_datatype(),
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                ConcreteDataType::time_millisecond_datatype(),
+                ConcreteDataType::duration_millisecond_datatype(),
+                ConcreteDataType::interval_month_day_nano_datatype(),
+                ConcreteDataType::decimal128_default_datatype(),
+            ],
+            vec![
+                Value::Boolean(true),
+                Value::Int8(8),
+                Value::UInt8(8),
+                Value::Int16(16),
+                Value::UInt16(16),
+                Value::Int32(32),
+                Value::UInt32(32),
+                Value::Int64(64),
+                Value::UInt64(64),
+                Value::Float32(1.0.into()),
+                Value::Float64(1.0.into()),
+                Value::Binary(b"hello"[..].into()),
+                Value::String("world".into()),
+                Value::Date(Date::new(10)),
+                Value::DateTime(DateTime::new(11)),
+                Value::Timestamp(Timestamp::new_millisecond(12)),
+                Value::Time(Time::new_millisecond(13)),
+                Value::Duration(Duration::new_millisecond(14)),
+                Value::Interval(Interval::from_month_day_nano(1, 1, 15)),
+                Value::Decimal128(Decimal128::from(16)),
+            ],
+        );
     }
 }
@@ -219,25 +219,14 @@ pub(crate) fn extract_data_batch(batch: &DataBatch) -> (u16, Vec<(i64, u64)>) {
 
 /// Builds key values with timestamps (ms) and sequences for test.
 pub(crate) fn build_key_values_with_ts_seq_values(
-    schema: &RegionMetadataRef,
+    metadata: &RegionMetadataRef,
     k0: String,
     k1: u32,
     timestamps: impl Iterator<Item = i64>,
    values: impl Iterator<Item = Option<f64>>,
     sequence: SequenceNumber,
 ) -> KeyValues {
-    let column_schema = schema
-        .column_metadatas
-        .iter()
-        .map(|c| api::v1::ColumnSchema {
-            column_name: c.column_schema.name.clone(),
-            datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
-                .unwrap()
-                .datatype() as i32,
-            semantic_type: c.semantic_type as i32,
-            ..Default::default()
-        })
-        .collect();
+    let column_schema = region_metadata_to_row_schema(metadata);
 
     let rows = timestamps
         .zip(values)

@@ -269,7 +258,23 @@ pub(crate) fn build_key_values_with_ts_seq_values(
             rows,
         }),
     };
-    KeyValues::new(schema.as_ref(), mutation).unwrap()
+    KeyValues::new(metadata.as_ref(), mutation).unwrap()
+}
+
+/// Converts the region metadata to column schemas for a row.
+pub fn region_metadata_to_row_schema(metadata: &RegionMetadataRef) -> Vec<api::v1::ColumnSchema> {
+    metadata
+        .column_metadatas
+        .iter()
+        .map(|c| api::v1::ColumnSchema {
+            column_name: c.column_schema.name.clone(),
+            datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
+                .unwrap()
+                .datatype() as i32,
+            semantic_type: c.semantic_type as i32,
+            ..Default::default()
+        })
+        .collect()
 }
 
 /// Encode keys.
@@ -18,7 +18,7 @@ futures.workspace = true
 lazy_static.workspace = true
 md5 = "0.7"
 moka = { workspace = true, features = ["future"] }
-opendal = { version = "0.44", features = [
+opendal = { version = "0.45", features = [
     "layers-tracing",
 ] }
 prometheus.workspace = true
@@ -91,7 +91,8 @@ impl Deleter {
             .await?;
 
         let affected_rows = self.do_request(deletes, &ctx).await?;
-        Ok(Output::AffectedRows(affected_rows as _))
+
+        Ok(Output::new_with_affected_rows(affected_rows))
     }
 
     pub async fn handle_table_delete(

@@ -111,7 +111,7 @@ impl Inserter {
             .await?;
 
         let affected_rows = self.do_request(inserts, &ctx).await?;
-        Ok(Output::AffectedRows(affected_rows as _))
+        Ok(Output::new_with_affected_rows(affected_rows))
     }
 
     /// Handle row inserts request with metric engine.

@@ -149,7 +149,7 @@ impl Inserter {
             .await?;
 
         let affected_rows = self.do_request(inserts, &ctx).await?;
-        Ok(Output::AffectedRows(affected_rows as _))
+        Ok(Output::new_with_affected_rows(affected_rows))
     }
 
     pub async fn handle_table_insert(

@@ -185,7 +185,7 @@ impl Inserter {
             .await?;
 
         let affected_rows = self.do_request(inserts, ctx).await?;
-        Ok(Output::AffectedRows(affected_rows as _))
+        Ok(Output::new_with_affected_rows(affected_rows))
     }
 }
 
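A long run of hunks here mechanically replaces `Output::AffectedRows(n)` with `Output::new_with_affected_rows(n)`, and later hunks match on `output.data` against `OutputData::*` variants. Together those call sites imply `Output` changed from an enum into a struct carrying data plus metadata. A minimal sketch of the implied shape; the real definition lives in the `common_query`/`client` crates and may differ in detail:

pub enum OutputData {
    AffectedRows(usize),
    // RecordBatches(..) and Stream(..) variants elided.
}

/// Metadata carried alongside the data, e.g. the physical plan.
#[derive(Default)]
pub struct OutputMeta;

pub struct Output {
    pub data: OutputData,
    pub meta: OutputMeta,
}

impl Output {
    pub fn new(data: OutputData, meta: OutputMeta) -> Self {
        Output { data, meta }
    }

    pub fn new_with_affected_rows(affected_rows: usize) -> Self {
        Self::new(OutputData::AffectedRows(affected_rows), OutputMeta)
    }
}

fn main() {
    let out = Output::new_with_affected_rows(3);
    match out.data {
        OutputData::AffectedRows(n) => assert_eq!(n, 3),
    }
}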
@@ -468,8 +468,6 @@ impl Inserter {
             &req.table_name,
         );
 
-        info!("Logical table `{table_ref}` does not exist, try creating table");
-
         let request_schema = req.rows.as_ref().unwrap().schema.as_slice();
         let mut create_table_expr = build_create_table_expr(&table_ref, request_schema)?;
 

@@ -40,12 +40,13 @@ use query::plan::LogicalPlan;
 use query::QueryEngineRef;
 use session::context::QueryContextRef;
 use session::table_name::table_idents_to_full_name;
-use snafu::{OptionExt, ResultExt};
+use snafu::{ensure, OptionExt, ResultExt};
 use sql::statements::copy::{CopyDatabase, CopyDatabaseArgument, CopyTable, CopyTableArgument};
+use sql::statements::set_variables::SetVariables;
 use sql::statements::statement::Statement;
 use sql::statements::OptionMap;
 use sql::util::format_raw_object_name;
-use sqlparser::ast::{Expr, ObjectName, Value};
+use sqlparser::ast::{Expr, Ident, ObjectName, Value};
 use table::requests::{CopyDatabaseRequest, CopyDirection, CopyTableRequest};
 use table::table_reference::TableReference;
 use table::TableRef;
@@ -122,11 +123,11 @@ impl StatementExecutor {
             CopyDirection::Export => self
                 .copy_table_to(req, query_ctx)
                 .await
-                .map(Output::AffectedRows),
+                .map(Output::new_with_affected_rows),
             CopyDirection::Import => self
                 .copy_table_from(req, query_ctx)
                 .await
-                .map(Output::AffectedRows),
+                .map(Output::new_with_affected_rows),
         }
     }
 

@@ -151,15 +152,15 @@ impl StatementExecutor {
 
             Statement::CreateTable(stmt) => {
                 let _ = self.create_table(stmt, query_ctx).await?;
-                Ok(Output::AffectedRows(0))
+                Ok(Output::new_with_affected_rows(0))
             }
             Statement::CreateTableLike(stmt) => {
                 let _ = self.create_table_like(stmt, query_ctx).await?;
-                Ok(Output::AffectedRows(0))
+                Ok(Output::new_with_affected_rows(0))
             }
             Statement::CreateExternalTable(stmt) => {
                 let _ = self.create_external_table(stmt, query_ctx).await?;
-                Ok(Output::AffectedRows(0))
+                Ok(Output::new_with_affected_rows(0))
             }
             Statement::Alter(alter_table) => self.alter_table(alter_table, query_ctx).await,
             Statement::DropTable(stmt) => {

@@ -207,6 +208,22 @@ impl StatementExecutor {
                 let var_name = set_var.variable.to_string().to_uppercase();
                 match var_name.as_str() {
                     "TIMEZONE" | "TIME_ZONE" => set_timezone(set_var.value, query_ctx)?,
+
+                    // Some postgresql client app may submit a "SET bytea_output" stmt upon connection.
+                    // However, currently we lack the support for it (tracked in https://github.com/GreptimeTeam/greptimedb/issues/3438),
+                    // so we just ignore it here instead of returning an error to break the connection.
+                    // Since the "bytea_output" only determines the output format of binary values,
+                    // it won't cause much trouble if we do so.
+                    // TODO(#3438): Remove this temporary workaround after the feature is implemented.
+                    "BYTEA_OUTPUT" => (),
+
+                    // Same as "bytea_output", we just ignore it here.
+                    // Not harmful since it only relates to how date is viewed in client app's output.
+                    // The tracked issue is https://github.com/GreptimeTeam/greptimedb/issues/3442.
+                    // TODO(#3442): Remove this temporary workaround after the feature is implemented.
+                    "DATESTYLE" => (),
+
+                    "CLIENT_ENCODING" => validate_client_encoding(set_var)?,
                     _ => {
                         return NotSupportedSnafu {
                             feat: format!("Unsupported set variable {}", var_name),

@@ -214,7 +231,7 @@ impl StatementExecutor {
                         .fail()
                     }
                 }
-                Ok(Output::AffectedRows(0))
+                Ok(Output::new_with_affected_rows(0))
             }
             Statement::ShowVariables(show_variable) => self.show_variable(show_variable, query_ctx),
         }
@@ -257,6 +274,39 @@ impl StatementExecutor {
     }
 }
 
+fn validate_client_encoding(set: SetVariables) -> Result<()> {
+    let Some((encoding, [])) = set.value.split_first() else {
+        return InvalidSqlSnafu {
+            err_msg: "must provide one and only one client encoding value",
+        }
+        .fail();
+    };
+    let encoding = match encoding {
+        Expr::Value(Value::SingleQuotedString(x))
+        | Expr::Identifier(Ident {
+            value: x,
+            quote_style: _,
+        }) => x.to_uppercase(),
+        _ => {
+            return InvalidSqlSnafu {
+                err_msg: format!("client encoding must be a string, actual: {:?}", encoding),
+            }
+            .fail();
+        }
+    };
+    // For the sake of simplicity, we only support "UTF8" ("UNICODE" is the alias for it,
+    // see https://www.postgresql.org/docs/current/multibyte.html#MULTIBYTE-CHARSET-SUPPORTED).
+    // "UTF8" is universal and sufficient for almost all cases.
+    // GreptimeDB itself is always using "UTF8" as the internal encoding.
+    ensure!(
+        encoding == "UTF8" || encoding == "UNICODE",
+        NotSupportedSnafu {
+            feat: format!("client encoding of '{}'", encoding)
+        }
+    );
+    Ok(())
+}
+
 fn set_timezone(exprs: Vec<Expr>, ctx: QueryContextRef) -> Result<()> {
     let tz_expr = exprs.first().context(NotSupportedSnafu {
         feat: "No timezone find in set variable statement",
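The validation above uses `split_first()` with an empty-rest slice pattern to require exactly one value in a single match. A standalone sketch of just that pattern:

fn exactly_one<T>(values: &[T]) -> Option<&T> {
    // `(first, [])` only matches when the remainder after the first
    // element is empty, i.e. the slice has exactly one element.
    let Some((first, [])) = values.split_first() else {
        return None; // zero elements, or more than one
    };
    Some(first)
}

fn main() {
    assert!(exactly_one::<i32>(&[]).is_none());
    assert_eq!(exactly_one(&[42]), Some(&42));
    assert!(exactly_one(&[1, 2]).is_none());
}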
@@ -15,10 +15,10 @@
 use std::path::Path;
 use std::str::FromStr;
 
+use client::Output;
 use common_datasource::file_format::Format;
 use common_datasource::lister::{Lister, Source};
 use common_datasource::object_store::build_backend;
-use common_query::Output;
 use common_telemetry::{debug, error, info, tracing};
 use object_store::Entry;
 use regex::Regex;

@@ -96,7 +96,7 @@ impl StatementExecutor {
                 .await?;
             exported_rows += exported;
         }
-        Ok(Output::AffectedRows(exported_rows))
+        Ok(Output::new_with_affected_rows(exported_rows))
     }
 
     /// Imports data to database from a given location and returns total rows imported.

@@ -169,7 +169,7 @@ impl StatementExecutor {
                 }
             }
         }
-        Ok(Output::AffectedRows(rows_inserted))
+        Ok(Output::new_with_affected_rows(rows_inserted))
     }
 }
 
@@ -14,6 +14,7 @@
 
 use std::sync::Arc;
 
+use client::OutputData;
 use common_base::readable_size::ReadableSize;
 use common_datasource::file_format::csv::stream_to_csv;
 use common_datasource::file_format::json::stream_to_json;

@@ -21,7 +22,6 @@ use common_datasource::file_format::parquet::stream_to_parquet;
 use common_datasource::file_format::Format;
 use common_datasource::object_store::{build_backend, parse_url};
 use common_datasource::util::find_dir_and_filename;
-use common_query::Output;
 use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
 use common_recordbatch::SendableRecordBatchStream;
 use common_telemetry::{debug, tracing};
@@ -134,9 +134,9 @@ impl StatementExecutor {
             .execute(LogicalPlan::DfPlan(plan), query_ctx)
             .await
             .context(ExecLogicalPlanSnafu)?;
-        let stream = match output {
-            Output::Stream(stream, _) => stream,
-            Output::RecordBatches(record_batches) => record_batches.as_stream(),
+        let stream = match output.data {
+            OutputData::Stream(stream) => stream,
+            OutputData::RecordBatches(record_batches) => record_batches.as_stream(),
             _ => unreachable!(),
         };
 

@@ -338,10 +338,10 @@ impl StatementExecutor {
                 .await
                 .context(error::InvalidateTableCacheSnafu)?;
 
-            Ok(Output::AffectedRows(0))
+            Ok(Output::new_with_affected_rows(0))
         } else if drop_if_exists {
             // DROP TABLE IF EXISTS meets table not found - ignored
-            Ok(Output::AffectedRows(0))
+            Ok(Output::new_with_affected_rows(0))
         } else {
             Err(TableNotFoundSnafu {
                 table_name: table_name.to_string(),

@@ -367,7 +367,7 @@ impl StatementExecutor {
         let table_id = table.table_info().table_id();
         self.truncate_table_procedure(&table_name, table_id).await?;
 
-        Ok(Output::AffectedRows(0))
+        Ok(Output::new_with_affected_rows(0))
     }
 
     fn verify_alter(

@@ -471,7 +471,7 @@ impl StatementExecutor {
             .await
             .context(error::InvalidateTableCacheSnafu)?;
 
-        Ok(Output::AffectedRows(0))
+        Ok(Output::new_with_affected_rows(0))
     }
 
     async fn create_table_procedure(

@@ -580,7 +580,7 @@ impl StatementExecutor {
 
         if exists {
             return if create_if_not_exists {
-                Ok(Output::AffectedRows(1))
+                Ok(Output::new_with_affected_rows(1))
             } else {
                 error::SchemaExistsSnafu { name: database }.fail()
             };

@@ -592,7 +592,7 @@ impl StatementExecutor {
             .await
             .context(TableMetadataManagerSnafu)?;
 
-        Ok(Output::AffectedRows(1))
+        Ok(Output::new_with_affected_rows(1))
     }
 }
 
@@ -429,7 +429,7 @@ mod test {
             ts_range,
             value_range,
             timestamps,
-            // that two `2.0` is because `duration_to_start` are shrunk to to
+            // that two `2.0` is because `duration_to_start` are shrunk to
             // `duration_to_zero`, and causes `duration_to_zero` less than
             // `extrapolation_threshold`.
             vec![2.0, 1.5, 1.5, 1.5, 2.0, 1.5, 1.5, 1.5],
@@ -28,7 +28,7 @@ use common_function::function::FunctionRef;
 use common_function::scalars::aggregate::AggregateFunctionMetaRef;
 use common_query::physical_plan::{DfPhysicalPlanAdapter, PhysicalPlan, PhysicalPlanAdapter};
 use common_query::prelude::ScalarUdf;
-use common_query::Output;
+use common_query::{Output, OutputData, OutputMeta};
 use common_recordbatch::adapter::RecordBatchStreamAdapter;
 use common_recordbatch::{EmptyRecordBatchStream, SendableRecordBatchStream};
 use common_telemetry::tracing;

@@ -90,9 +90,9 @@ impl DatafusionQueryEngine {
             optimized_physical_plan
         };
 
-        Ok(Output::Stream(
-            self.execute_stream(&ctx, &physical_plan)?,
-            Some(physical_plan),
+        Ok(Output::new(
+            OutputData::Stream(self.execute_stream(&ctx, &physical_plan)?),
+            OutputMeta::new_with_plan(physical_plan),
         ))
     }
 

@@ -121,9 +121,9 @@ impl DatafusionQueryEngine {
         let output = self
             .exec_query_plan(LogicalPlan::DfPlan((*dml.input).clone()), query_ctx.clone())
             .await?;
-        let mut stream = match output {
-            Output::RecordBatches(batches) => batches.as_stream(),
-            Output::Stream(stream, _) => stream,
+        let mut stream = match output.data {
+            OutputData::RecordBatches(batches) => batches.as_stream(),
+            OutputData::Stream(stream) => stream,
             _ => unreachable!(),
         };
 

@@ -148,7 +148,7 @@ impl DatafusionQueryEngine {
             };
             affected_rows += rows;
         }
-        Ok(Output::AffectedRows(affected_rows))
+        Ok(Output::new_with_affected_rows(affected_rows))
     }
 
     #[tracing::instrument(skip_all)]

@@ -471,7 +471,6 @@ mod tests {
 
     use catalog::RegisterTableRequest;
     use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, NUMBERS_TABLE_ID};
-    use common_query::Output;
    use common_recordbatch::util;
     use datafusion::prelude::{col, lit};
     use datatypes::prelude::ConcreteDataType;

@@ -534,8 +533,8 @@ mod tests {
 
         let output = engine.execute(plan, QueryContext::arc()).await.unwrap();
 
-        match output {
-            Output::Stream(recordbatch, _) => {
+        match output.data {
+            OutputData::Stream(recordbatch) => {
                 let numbers = util::collect(recordbatch).await.unwrap();
                 assert_eq!(1, numbers.len());
                 assert_eq!(numbers[0].num_columns(), 1);
@@ -15,7 +15,8 @@
 use std::pin::Pin;
 use std::task::{Context, Poll};
 
-use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
+use common_recordbatch::adapter::RecordBatchMetrics;
+use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
 use datatypes::schema::SchemaRef;
 use futures::Stream;
 use futures_util::ready;

@@ -78,6 +79,14 @@ impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
     fn schema(&self) -> SchemaRef {
         self.stream.schema()
     }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        self.stream.output_ordering()
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        self.stream.metrics()
+    }
 }
 
 impl<F: FnOnce() + Unpin> Stream for OnDone<F> {
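The `OnDone` hunk matters because a wrapper stream that implements only the required trait methods silently discards the inner stream's ordering and metrics through the default implementations. A tiny sketch of the delegation pattern; the trait and method here are illustrative, not the `RecordBatchStream` API:

trait MetaStream {
    fn ordering(&self) -> Option<&str> {
        None // default: no ordering reported
    }
}

struct Inner;
impl MetaStream for Inner {
    fn ordering(&self) -> Option<&str> {
        Some("ts ASC")
    }
}

struct Wrapper<S>(S);
impl<S: MetaStream> MetaStream for Wrapper<S> {
    // Without this explicit delegation, the default `None` would hide
    // the inner ordering from whoever consumes the wrapped stream.
    fn ordering(&self) -> Option<&str> {
        self.0.ordering()
    }
}

fn main() {
    assert_eq!(Wrapper(Inner).ordering(), Some("ts ASC"));
}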
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::hash_map::Entry;
-use std::collections::HashMap;
+use std::any::Any;
+use std::cmp::Ordering;
+use std::collections::btree_map::Entry;
+use std::collections::{BTreeMap, HashMap};
 use std::fmt::Display;
 use std::pin::Pin;
 use std::sync::Arc;

@@ -21,8 +23,8 @@ use std::task::{Context, Poll};
 use std::time::Duration;
 
 use ahash::RandomState;
-use arrow::compute::{self, cast_with_options, CastOptions};
-use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit};
+use arrow::compute::{self, cast_with_options, CastOptions, SortColumn};
+use arrow_schema::{DataType, Field, Schema, SchemaRef, SortOptions, TimeUnit};
 use common_query::DfPhysicalPlan;
 use common_recordbatch::DfSendableRecordBatchStream;
 use datafusion::common::{Result as DataFusionResult, Statistics};

@@ -35,10 +37,14 @@ use datafusion::physical_plan::{
     SendableRecordBatchStream,
 };
 use datafusion::physical_planner::create_physical_sort_expr;
-use datafusion_common::utils::get_arrayref_at_indices;
+use datafusion_common::utils::{get_arrayref_at_indices, get_row_at_idx};
 use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, ScalarValue};
-use datafusion_expr::utils::exprlist_to_fields;
-use datafusion_expr::{Accumulator, Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore};
+use datafusion_expr::utils::{exprlist_to_fields, COUNT_STAR_EXPANSION};
+use datafusion_expr::{
+    lit, Accumulator, AggregateFunction, Expr, ExprSchemable, LogicalPlan,
+    UserDefinedLogicalNodeCore,
+};
+use datafusion_physical_expr::aggregate::utils::down_cast_any_ref;
 use datafusion_physical_expr::expressions::create_aggregate_expr as create_aggr_expr;
 use datafusion_physical_expr::hash_utils::create_hashes;
 use datafusion_physical_expr::{
@@ -58,6 +64,140 @@ use crate::error::{DataFusionSnafu, RangeQuerySnafu, Result};
 
 type Millisecond = <TimestampMillisecondType as ArrowPrimitiveType>::Native;
 
+/// Implementation of `first_value`/`last_value`
+/// aggregate function adapted to range query
+#[derive(Debug)]
+struct RangeFirstListValue {
+    /// calculate expr
+    expr: Arc<dyn PhysicalExpr>,
+    order_bys: Vec<PhysicalSortExpr>,
+}
+
+impl RangeFirstListValue {
+    pub fn new_aggregate_expr(
+        expr: Arc<dyn PhysicalExpr>,
+        order_bys: Vec<PhysicalSortExpr>,
+    ) -> Arc<dyn AggregateExpr> {
+        Arc::new(Self { expr, order_bys })
+    }
+}
+
+impl PartialEq<dyn Any> for RangeFirstListValue {
+    fn eq(&self, other: &dyn Any) -> bool {
+        down_cast_any_ref(other)
+            .downcast_ref::<Self>()
+            .map(|x| self.expr.eq(&x.expr) && self.order_bys.iter().eq(x.order_bys.iter()))
+            .unwrap_or(false)
+    }
+}
+
+impl AggregateExpr for RangeFirstListValue {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn create_accumulator(&self) -> DataFusionResult<Box<dyn Accumulator>> {
+        Ok(Box::new(RangeFirstListValueAcc::new(
+            self.order_bys.iter().map(|order| order.options).collect(),
+        )))
+    }
+
+    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
+        let mut exprs: Vec<_> = self
+            .order_bys
+            .iter()
+            .map(|order| order.expr.clone())
+            .collect();
+        exprs.push(self.expr.clone());
+        exprs
+    }
+
+    fn field(&self) -> DataFusionResult<Field> {
+        unreachable!("AggregateExpr::field will not be used in range query")
+    }
+
+    fn state_fields(&self) -> DataFusionResult<Vec<Field>> {
+        unreachable!("AggregateExpr::state_fields will not be used in range query")
+    }
+}
+
+#[derive(Debug)]
+pub struct RangeFirstListValueAcc {
+    pub sort_options: Vec<SortOptions>,
+    pub sort_columns: Vec<ScalarValue>,
+    pub data: Option<ScalarValue>,
+}
+
+impl RangeFirstListValueAcc {
+    pub fn new(sort_options: Vec<SortOptions>) -> Self {
+        Self {
+            sort_options,
+            sort_columns: vec![],
+            data: None,
+        }
+    }
+}
+
+impl Accumulator for RangeFirstListValueAcc {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> DataFusionResult<()> {
+        let columns: Vec<_> = values
+            .iter()
+            .zip(self.sort_options.iter())
+            .map(|(v, s)| SortColumn {
+                values: v.clone(),
+                options: Some(*s),
+            })
+            .collect();
+        // finding the Top1 problem with complexity O(n)
+        let idx = compute::lexsort_to_indices(&columns, Some(1))?.value(0);
+        let vs = get_row_at_idx(values, idx as usize)?;
+        let need_update = self.data.is_none()
+            || vs
+                .iter()
+                .zip(self.sort_columns.iter())
+                .zip(self.sort_options.iter())
+                .find_map(|((new_value, old_value), sort_option)| {
+                    if new_value.is_null() && old_value.is_null() {
+                        None
+                    } else if sort_option.nulls_first
+                        && (new_value.is_null() || old_value.is_null())
+                    {
+                        Some(new_value.is_null())
+                    } else {
+                        new_value.partial_cmp(old_value).map(|x| {
+                            (x == Ordering::Greater && sort_option.descending)
+                                || (x == Ordering::Less && !sort_option.descending)
+                        })
+                    }
+                })
+                .unwrap_or(false);
+        if need_update {
+            self.sort_columns = vs;
+            self.data = Some(ScalarValue::try_from_array(
+                &values[self.sort_options.len()],
+                idx as usize,
+            )?);
+        }
+        Ok(())
+    }
+
+    fn evaluate(&self) -> DataFusionResult<ScalarValue> {
+        Ok(self.data.clone().unwrap_or(ScalarValue::Null))
+    }
+
+    fn size(&self) -> usize {
+        std::mem::size_of_val(self)
+    }
+
+    fn state(&self) -> DataFusionResult<Vec<ScalarValue>> {
+        unreachable!("Accumulator::state will not be used in range query")
+    }
+
+    fn merge_batch(&mut self, _states: &[ArrayRef]) -> DataFusionResult<()> {
+        unreachable!("Accumulator::merge_batch will not be used in range query")
+    }
+}
+
 #[derive(PartialEq, Eq, Debug, Hash, Clone)]
 pub enum Fill {
     Null,
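`RangeFirstListValueAcc::update_batch` above finds the first/last row by a lexicographic sort with `limit = Some(1)`, which arrow can compute as an O(n) top-1 scan instead of a full sort. A standalone example of that call using the arrow-rs `lexsort_to_indices` API; verify the exact signature against the arrow version pinned by the workspace:

use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array, StringArray};
use arrow::compute::{lexsort_to_indices, SortColumn};
use arrow_schema::SortOptions;

fn main() -> Result<(), arrow::error::ArrowError> {
    // Sort key: timestamps ascending; the payload rides along by row index.
    let ts: ArrayRef = Arc::new(Int64Array::from(vec![30, 10, 20]));
    let payload = StringArray::from(vec!["c", "a", "b"]);

    let columns = vec![SortColumn {
        values: ts,
        options: Some(SortOptions {
            descending: false,
            nulls_first: true,
        }),
    }];
    // `Some(1)` asks only for the first row of the sorted order: a top-1
    // selection rather than an O(n log n) sort of the whole batch.
    let idx = lexsort_to_indices(&columns, Some(1))?.value(0);
    assert_eq!(payload.value(idx as usize), "a");
    Ok(())
}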
@@ -78,14 +218,15 @@ impl Display for Fill {
 }
 
 impl Fill {
-    pub fn try_from_str(value: &str, datatype: &DataType) -> DfResult<Self> {
+    pub fn try_from_str(value: &str, datatype: &DataType) -> DfResult<Option<Self>> {
         let s = value.to_uppercase();
         match s.as_str() {
-            "NULL" | "" => Ok(Self::Null),
-            "PREV" => Ok(Self::Prev),
+            "" => Ok(None),
+            "NULL" => Ok(Some(Self::Null)),
+            "PREV" => Ok(Some(Self::Prev)),
             "LINEAR" => {
                 if datatype.is_numeric() {
-                    Ok(Self::Linear)
+                    Ok(Some(Self::Linear))
                 } else {
                     Err(DataFusionError::Plan(format!(
                         "Use FILL LINEAR on Non-numeric DataType {}",

@@ -100,13 +241,17 @@ impl Fill {
                         s, err
                     ))
                 })
-                .map(Fill::Const),
+                .map(|x| Some(Fill::Const(x))),
         }
     }
 
     /// The input `data` contains data on a complete time series.
     /// If the filling strategy is `PREV` or `LINEAR`, caller must be ensured that the incoming `ts`&`data` is ascending time order.
     pub fn apply_fill_strategy(&self, ts: &[i64], data: &mut [ScalarValue]) -> DfResult<()> {
+        // No calculation need in `Fill::Null`
+        if matches!(self, Fill::Null) {
+            return Ok(());
+        }
         let len = data.len();
         if *self == Fill::Linear {
             return Self::fill_linear(ts, data);
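With `try_from_str` now returning `DfResult<Option<Self>>`, an absent FILL clause (the empty string) becomes `None` and is distinguishable from an explicit `FILL NULL`. A toy restatement of just that mapping; it deliberately omits the `FILL <const>` arm and real error handling:

#[derive(Debug, PartialEq)]
enum Fill {
    Null,
    Prev,
    Linear,
}

fn try_from_str(value: &str) -> Option<Fill> {
    match value.to_uppercase().as_str() {
        "" => None, // no FILL clause given
        "NULL" => Some(Fill::Null),
        "PREV" => Some(Fill::Prev),
        "LINEAR" => Some(Fill::Linear),
        _ => None, // the real code parses a typed constant here
    }
}

fn main() {
    assert_eq!(try_from_str(""), None);
    assert_eq!(try_from_str("null"), Some(Fill::Null));
}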
@@ -114,7 +259,6 @@ impl Fill {
         for i in 0..len {
             if data[i].is_null() {
                 match self {
-                    Fill::Null => continue,
                     Fill::Prev => {
                         if i != 0 {
                             data[i] = data[i - 1].clone()

@@ -122,7 +266,8 @@ impl Fill {
                     }
                     // The calculation of linear interpolation is relatively complicated.
                     // `Self::fill_linear` is used to dispose `Fill::Linear`.
-                    Fill::Linear => unreachable!(),
+                    // No calculation need in `Fill::Null`
+                    Fill::Linear | Fill::Null => unreachable!(),
                     Fill::Const(v) => data[i] = v.clone(),
                 }
             }
@@ -219,12 +364,12 @@ fn linear_interpolation(
 
 #[derive(Eq, Clone, Debug)]
 pub struct RangeFn {
-    /// with format like `max(a) RANGE 300s FILL NULL`
+    /// with format like `max(a) RANGE 300s [FILL NULL]`
     pub name: String,
     pub data_type: DataType,
     pub expr: Expr,
     pub range: Duration,
-    pub fill: Fill,
+    pub fill: Option<Fill>,
     /// If the `FIll` strategy is `Linear` and the output is an integer,
     /// it is possible to calculate a floating point number.
     /// So for `FILL==LINEAR`, the entire data will be implicitly converted to Float type
@@ -271,6 +416,7 @@ pub struct RangeSelect {
     pub align: Duration,
     pub align_to: i64,
     pub time_index: String,
+    pub time_expr: Expr,
     pub by: Vec<Expr>,
     pub schema: DFSchemaRef,
     pub by_schema: DFSchemaRef,

@@ -324,7 +470,7 @@ impl RangeSelect {
                     name,
                     data_type.clone(),
                     // Only when data fill with Const option, the data can't be null
-                    !matches!(fill, Fill::Const(..)),
+                    !matches!(fill, Some(Fill::Const(..))),
                 ))
             },
         )

@@ -382,6 +528,7 @@ impl RangeSelect {
             align,
             align_to,
             time_index: time_index_name,
+            time_expr: time_index,
             schema,
             by_schema,
             by,

@@ -440,6 +587,7 @@ impl UserDefinedLogicalNodeCore for RangeSelect {
             range_expr: self.range_expr.clone(),
             input: Arc::new(inputs[0].clone()),
             time_index: self.time_index.clone(),
+            time_expr: self.time_expr.clone(),
             schema: self.schema.clone(),
             by: self.by.clone(),
             by_schema: self.by_schema.clone(),

@@ -452,6 +600,7 @@ impl RangeSelect {
 impl RangeSelect {
     fn create_physical_expr_list(
         &self,
+        is_count_aggr: bool,
         exprs: &[Expr],
         df_schema: &Arc<DFSchema>,
         schema: &Schema,
@@ -459,7 +608,20 @@ impl RangeSelect {
     ) -> DfResult<Vec<Arc<dyn PhysicalExpr>>> {
         exprs
             .iter()
-            .map(|by| create_physical_expr(by, df_schema, schema, session_state.execution_props()))
+            .map(|e| match e {
+                // `count(*)` will be rewritten by `CountWildcardRule` into `count(1)` when optimizing logical plan.
+                // The modification occurs after range plan rewrite.
+                // At this time, aggregate plan has been replaced by a custom range plan,
+                // so `CountWildcardRule` has not been applied.
+                // We manually modify it when creating the physical plan.
+                Expr::Wildcard if is_count_aggr => create_physical_expr(
+                    &lit(COUNT_STAR_EXPANSION),
+                    df_schema,
+                    schema,
+                    session_state.execution_props(),
+                ),
+                _ => create_physical_expr(e, df_schema, schema, session_state.execution_props()),
+            })
             .collect::<DfResult<Vec<_>>>()
     }
 
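The wildcard arm above substitutes `lit(COUNT_STAR_EXPANSION)` for `count(*)`; in the DataFusion versions I know, that expansion constant is the literal `1`, which is sound because counting a never-null constant per row equals counting rows. A toy illustration of the equivalence:

fn count_star(rows: usize) -> usize {
    // COUNT(*) counts rows...
    rows
}

fn count_const(rows: &[i64]) -> usize {
    // ...and COUNT(1) counts one non-null literal per row: same result.
    rows.iter().map(|_| 1).sum()
}

fn main() {
    let rows = vec![10, 20, 30];
    assert_eq!(count_star(rows.len()), count_const(&rows));
}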
@@ -488,6 +650,72 @@ impl RangeSelect {
             .iter()
             .map(|range_fn| {
                 let expr = match &range_fn.expr {
+                    Expr::AggregateFunction(aggr)
+                        if aggr.fun == AggregateFunction::FirstValue
+                            || aggr.fun == AggregateFunction::LastValue =>
+                    {
+                        // Because we only need to find the first_value/last_value,
+                        // the complexity of sorting the entire batch is O(nlogn).
+                        // We can sort the batch with limit 1.
+                        // In this case, the algorithm degenerates into finding the Top1 problem with complexity O(n).
+                        // We need reverse the sort order of last_value to correctly apply limit 1 when sorting.
+                        let order_by = if let Some(exprs) = &aggr.order_by {
+                            exprs
+                                .iter()
+                                .map(|x| {
+                                    create_physical_sort_expr(
+                                        x,
+                                        input_dfschema,
+                                        &input_schema,
+                                        session_state.execution_props(),
+                                    )
+                                    .map(|expr| {
+                                        // reverse the last_value sort
+                                        if aggr.fun == AggregateFunction::LastValue {
+                                            PhysicalSortExpr {
+                                                expr: expr.expr,
+                                                options: SortOptions {
+                                                    descending: !expr.options.descending,
+                                                    nulls_first: !expr.options.nulls_first,
+                                                },
+                                            }
+                                        } else {
+                                            expr
+                                        }
+                                    })
+                                })
+                                .collect::<DfResult<Vec<_>>>()?
+                        } else {
+                            // if user not assign order by, time index is needed as default ordering
+                            let time_index = create_physical_expr(
+                                &self.time_expr,
+                                input_dfschema,
+                                &input_schema,
+                                session_state.execution_props(),
+                            )?;
+                            vec![PhysicalSortExpr {
+                                expr: time_index,
+                                options: SortOptions {
+                                    descending: aggr.fun == AggregateFunction::LastValue,
+                                    nulls_first: false,
+                                },
+                            }]
+                        };
+                        let arg = self.create_physical_expr_list(
+                            false,
+                            &aggr.args,
+                            input_dfschema,
+                            &input_schema,
+                            session_state,
+                        )?;
+                        // first_value/last_value has only one param.
+                        // The param have been checked by datafusion in logical plan stage.
+                        // We can safely assume that there is only one element here.
+                        Ok(RangeFirstListValue::new_aggregate_expr(
+                            arg[0].clone(),
+                            order_by,
+                        ))
+                    }
                     Expr::AggregateFunction(aggr) => {
                         let order_by = if let Some(exprs) = &aggr.order_by {
                             exprs
@@ -508,6 +736,7 @@ impl RangeSelect {
                             &aggr.fun,
                             false,
                             &self.create_physical_expr_list(
+                                aggr.fun == AggregateFunction::Count,
                                 &aggr.args,
                                 input_dfschema,
                                 &input_schema,

@@ -523,6 +752,7 @@ impl RangeSelect {
                         let expr = create_aggr_udf_expr(
                             &aggr_udf.fun,
                             &self.create_physical_expr_list(
+                                false,
                                 &aggr_udf.args,
                                 input_dfschema,
                                 &input_schema,

@@ -564,6 +794,7 @@ impl RangeSelect {
             align: self.align.as_millis() as Millisecond,
             align_to: self.align_to,
             by: self.create_physical_expr_list(
+                false,
                 &self.by,
                 input_dfschema,
                 &input_schema,

@@ -584,10 +815,26 @@ struct RangeFnExec {
     pub expr: Arc<dyn AggregateExpr>,
     pub args: Vec<Arc<dyn PhysicalExpr>>,
     pub range: Millisecond,
-    pub fill: Fill,
+    pub fill: Option<Fill>,
     pub need_cast: Option<DataType>,
 }
 
+impl Display for RangeFnExec {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if let Some(fill) = &self.fill {
+            write!(
+                f,
+                "{} RANGE {}s FILL {}",
+                self.expr.name(),
+                self.range / 1000,
+                fill
+            )
+        } else {
+            write!(f, "{} RANGE {}s", self.expr.name(), self.range / 1000)
+        }
+    }
+}
+
 #[derive(Debug)]
 pub struct RangeSelectExec {
     input: Arc<dyn ExecutionPlan>,
@@ -608,18 +855,8 @@ impl DisplayAs for RangeSelectExec {
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
                 write!(f, "RangeSelectExec: ")?;
-                let range_expr_strs: Vec<String> = self
-                    .range_exec
-                    .iter()
-                    .map(|e| {
-                        format!(
-                            "{} RANGE {}s FILL {}",
-                            e.expr.name(),
-                            e.range / 1000,
-                            e.fill
-                        )
-                    })
-                    .collect();
+                let range_expr_strs: Vec<String> =
+                    self.range_exec.iter().map(RangeFnExec::to_string).collect();
                 let by: Vec<String> = self.by.iter().map(|e| e.to_string()).collect();
                 write!(
                     f,
@@ -713,7 +950,7 @@ impl ExecutionPlan for RangeSelectExec {
             by: self.by.clone(),
             series_map: HashMap::new(),
             exec_state: ExecutionState::ReadingInput,
-            output_num_rows: 0,
+            num_not_null_rows: 0,
             row_converter,
             modify_map: HashMap::new(),
             metric: baseline_metric,

@@ -753,8 +990,8 @@ struct RangeSelectStream {
     /// value: `[row_ids]`
     /// It is used to record the data that needs to be aggregated in each time slot during the data update process
     modify_map: HashMap<(u64, Millisecond), Vec<u32>>,
-    /// The number of rows of the final output
-    output_num_rows: usize,
+    /// The number of rows of not null rows in the final output
+    num_not_null_rows: usize,
     metric: BaselineMetrics,
     schema_project: Option<Vec<usize>>,
     schema_before_project: SchemaRef,

@@ -766,7 +1003,7 @@ struct SeriesState {
     row: OwnedRow,
     /// key: align_ts
     /// value: a vector, each element is a range_fn follow the order of `range_exec`
-    align_ts_accumulator: HashMap<Millisecond, Vec<Box<dyn Accumulator>>>,
+    align_ts_accumulator: BTreeMap<Millisecond, Vec<Box<dyn Accumulator>>>,
 }
 
 /// Use `align_to` as time origin.

@@ -882,7 +1119,7 @@ impl RangeSelectStream {
                 let accumulators_map =
                     self.series_map.entry(*hash).or_insert_with(|| SeriesState {
                         row: by_rows.row(*row as usize).owned(),
-                        align_ts_accumulator: HashMap::new(),
+                        align_ts_accumulator: BTreeMap::new(),
                     });
                 match accumulators_map.align_ts_accumulator.entry(*ts) {
                     Entry::Occupied(mut e) => {

@@ -890,7 +1127,7 @@ impl RangeSelectStream {
                         accumulators[i].update_batch(&sliced_arrays)
                     }
                     Entry::Vacant(e) => {
-                        self.output_num_rows += 1;
+                        self.num_not_null_rows += 1;
                        let mut accumulators = self
                             .range_exec
                             .iter()
@@ -915,29 +1152,47 @@ impl RangeSelectStream {
|
|||||||
// 1 for time index column
|
// 1 for time index column
|
||||||
let mut columns: Vec<Arc<dyn Array>> =
|
let mut columns: Vec<Arc<dyn Array>> =
|
||||||
Vec::with_capacity(1 + self.range_exec.len() + self.by.len());
|
Vec::with_capacity(1 + self.range_exec.len() + self.by.len());
|
||||||
let mut ts_builder = TimestampMillisecondBuilder::with_capacity(self.output_num_rows);
|
let mut ts_builder = TimestampMillisecondBuilder::with_capacity(self.num_not_null_rows);
|
||||||
let mut all_scalar = vec![Vec::with_capacity(self.output_num_rows); self.range_exec.len()];
|
let mut all_scalar =
|
||||||
let mut by_rows = Vec::with_capacity(self.output_num_rows);
|
vec![Vec::with_capacity(self.num_not_null_rows); self.range_exec.len()];
|
||||||
|
let mut by_rows = Vec::with_capacity(self.num_not_null_rows);
|
||||||
let mut start_index = 0;
|
let mut start_index = 0;
|
||||||
// RangePlan is calculated on a row basis. If a column uses the PREV or LINEAR filling strategy,
|
// If any range expr need fill, we need fill both the missing align_ts and null value.
|
||||||
// we must arrange the data in the entire data row to determine the NULL filling value.
|
let need_fill_output = self.range_exec.iter().any(|range| range.fill.is_some());
|
||||||
let need_sort_output = self
|
// The padding value for each accumulator
|
||||||
|
let padding_values = self
|
||||||
.range_exec
|
.range_exec
|
||||||
.iter()
|
.iter()
|
||||||
.any(|range| range.fill == Fill::Linear || range.fill == Fill::Prev);
|
.map(|e| e.expr.create_accumulator()?.evaluate())
|
||||||
|
.collect::<DfResult<Vec<_>>>()?;
|
||||||
for SeriesState {
|
for SeriesState {
|
||||||
row,
|
row,
|
||||||
align_ts_accumulator,
|
align_ts_accumulator,
|
||||||
} in self.series_map.values()
|
} in self.series_map.values()
|
||||||
{
|
{
|
||||||
// collect data on time series
|
// skip empty time series
|
||||||
let mut align_ts = align_ts_accumulator.keys().copied().collect::<Vec<_>>();
|
if align_ts_accumulator.is_empty() {
|
||||||
if need_sort_output {
|
continue;
|
||||||
align_ts.sort();
|
|
||||||
}
|
}
|
||||||
|
// find the first and last align_ts
|
||||||
|
let begin_ts = *align_ts_accumulator.first_key_value().unwrap().0;
|
||||||
|
let end_ts = *align_ts_accumulator.last_key_value().unwrap().0;
|
||||||
|
let align_ts = if need_fill_output {
|
||||||
|
// we need to fill empty align_ts which not data in that solt
|
||||||
|
(begin_ts..=end_ts).step_by(self.align as usize).collect()
|
||||||
|
} else {
|
||||||
|
align_ts_accumulator.keys().copied().collect::<Vec<_>>()
|
||||||
|
};
|
||||||
for ts in &align_ts {
|
for ts in &align_ts {
|
||||||
for (i, accumulator) in align_ts_accumulator.get(ts).unwrap().iter().enumerate() {
|
if let Some(slot) = align_ts_accumulator.get(ts) {
|
||||||
all_scalar[i].push(accumulator.evaluate()?);
|
for (column, acc) in all_scalar.iter_mut().zip(slot.iter()) {
|
||||||
|
column.push(acc.evaluate()?);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// fill null in empty time solt
|
||||||
|
for (column, padding) in all_scalar.iter_mut().zip(padding_values.iter()) {
|
||||||
|
column.push(padding.clone())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ts_builder.append_slice(&align_ts);
|
ts_builder.append_slice(&align_ts);
|
||||||
@@ -950,14 +1205,16 @@ impl RangeSelectStream {
|
|||||||
) in self.range_exec.iter().enumerate()
|
) in self.range_exec.iter().enumerate()
|
||||||
{
|
{
|
||||||
let time_series_data =
|
let time_series_data =
|
||||||
&mut all_scalar[i][start_index..start_index + align_ts_accumulator.len()];
|
&mut all_scalar[i][start_index..start_index + align_ts.len()];
|
||||||
if let Some(data_type) = need_cast {
|
if let Some(data_type) = need_cast {
|
||||||
cast_scalar_values(time_series_data, data_type)?;
|
cast_scalar_values(time_series_data, data_type)?;
|
||||||
}
|
}
|
||||||
fill.apply_fill_strategy(&align_ts, time_series_data)?;
|
if let Some(fill) = fill {
|
||||||
|
fill.apply_fill_strategy(&align_ts, time_series_data)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
by_rows.resize(by_rows.len() + align_ts_accumulator.len(), row.row());
|
by_rows.resize(by_rows.len() + align_ts.len(), row.row());
|
||||||
start_index += align_ts_accumulator.len();
|
start_index += align_ts.len();
|
||||||
}
|
}
|
||||||
for column_scalar in all_scalar {
|
for column_scalar in all_scalar {
|
||||||
columns.push(ScalarValue::iter_to_array(column_scalar)?);
|
columns.push(ScalarValue::iter_to_array(column_scalar)?);
|
||||||
@@ -1078,7 +1335,7 @@ mod test {
|
|||||||
|
|
||||||
const TIME_INDEX_COLUMN: &str = "timestamp";
|
const TIME_INDEX_COLUMN: &str = "timestamp";
|
||||||
|
|
||||||
fn prepare_test_data(is_float: bool) -> MemoryExec {
|
fn prepare_test_data(is_float: bool, is_gap: bool) -> MemoryExec {
|
||||||
let schema = Arc::new(Schema::new(vec![
|
let schema = Arc::new(Schema::new(vec![
|
||||||
Field::new(TIME_INDEX_COLUMN, TimestampMillisecondType::DATA_TYPE, true),
|
Field::new(TIME_INDEX_COLUMN, TimestampMillisecondType::DATA_TYPE, true),
|
||||||
Field::new(
|
Field::new(
|
||||||
@@ -1092,16 +1349,23 @@ mod test {
|
|||||||
),
|
),
|
||||||
Field::new("host", DataType::Utf8, true),
|
Field::new("host", DataType::Utf8, true),
|
||||||
]));
|
]));
|
||||||
let timestamp_column: Arc<dyn Array> = Arc::new(TimestampMillisecondArray::from(vec![
|
let timestamp_column: Arc<dyn Array> = if !is_gap {
|
||||||
0, 5_000, 10_000, 15_000, 20_000, // host 1 every 5s
|
Arc::new(TimestampMillisecondArray::from(vec![
|
||||||
0, 5_000, 10_000, 15_000, 20_000, // host 2 every 5s
|
0, 5_000, 10_000, 15_000, 20_000, // host 1 every 5s
|
||||||
])) as _;
|
0, 5_000, 10_000, 15_000, 20_000, // host 2 every 5s
|
||||||
let mut host = vec!["host1"; 5];
|
])) as _
|
||||||
host.extend(vec!["host2"; 5]);
|
} else {
|
||||||
let value_column: Arc<dyn Array> = if is_float {
|
Arc::new(TimestampMillisecondArray::from(vec![
|
||||||
Arc::new(nullable_array!(Float64;
|
0, 15_000, // host 1 every 5s, missing data on 5_000, 10_000
|
||||||
0.0, null, 1.0, null, 2.0, // data for host 1
|
0, 15_000, // host 2 every 5s, missing data on 5_000, 10_000
|
||||||
3.0, null, 4.0, null, 5.0 // data for host 2
|
])) as _
|
||||||
|
};
|
||||||
|
let mut host = vec!["host1"; timestamp_column.len() / 2];
|
||||||
|
host.extend(vec!["host2"; timestamp_column.len() / 2]);
|
||||||
|
let mut value_column: Arc<dyn Array> = if is_gap {
|
||||||
|
Arc::new(nullable_array!(Int64;
|
||||||
|
0, 6, // data for host 1
|
||||||
|
6, 12 // data for host 2
|
||||||
)) as _
|
)) as _
|
||||||
} else {
|
} else {
|
||||||
Arc::new(nullable_array!(Int64;
|
Arc::new(nullable_array!(Int64;
|
||||||
@@ -1109,6 +1373,11 @@ mod test {
|
|||||||
3, null, 4, null, 5 // data for host 2
|
3, null, 4, null, 5 // data for host 2
|
||||||
)) as _
|
)) as _
|
||||||
};
|
};
|
||||||
|
if is_float {
|
||||||
|
value_column =
|
||||||
|
cast_with_options(&value_column, &DataType::Float64, &CastOptions::default())
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
let host_column: Arc<dyn Array> = Arc::new(StringArray::from(host)) as _;
|
let host_column: Arc<dyn Array> = Arc::new(StringArray::from(host)) as _;
|
||||||
let data = RecordBatch::try_new(
|
let data = RecordBatch::try_new(
|
||||||
schema.clone(),
|
schema.clone(),
|
||||||
@@ -1123,8 +1392,9 @@ mod test {
|
|||||||
range1: Millisecond,
|
range1: Millisecond,
|
||||||
range2: Millisecond,
|
range2: Millisecond,
|
||||||
align: Millisecond,
|
align: Millisecond,
|
||||||
fill: Fill,
|
fill: Option<Fill>,
|
||||||
is_float: bool,
|
is_float: bool,
|
||||||
|
is_gap: bool,
|
||||||
expected: String,
|
expected: String,
|
||||||
) {
|
) {
|
||||||
let data_type = if is_float {
|
let data_type = if is_float {
|
||||||
@@ -1132,13 +1402,13 @@ mod test {
|
|||||||
} else {
|
} else {
|
||||||
DataType::Int64
|
DataType::Int64
|
||||||
};
|
};
|
||||||
let (need_cast, schema_data_type) = if !is_float && fill == Fill::Linear {
|
let (need_cast, schema_data_type) = if !is_float && matches!(fill, Some(Fill::Linear)) {
|
||||||
// data_type = DataType::Float64;
|
// data_type = DataType::Float64;
|
||||||
(Some(DataType::Float64), DataType::Float64)
|
(Some(DataType::Float64), DataType::Float64)
|
||||||
} else {
|
} else {
|
||||||
(None, data_type.clone())
|
(None, data_type.clone())
|
||||||
};
|
};
|
||||||
let memory_exec = Arc::new(prepare_test_data(is_float));
|
let memory_exec = Arc::new(prepare_test_data(is_float, is_gap));
|
||||||
let schema = Arc::new(Schema::new(vec![
|
let schema = Arc::new(Schema::new(vec![
|
||||||
Field::new("MIN(value)", schema_data_type.clone(), true),
|
Field::new("MIN(value)", schema_data_type.clone(), true),
|
||||||
Field::new("MAX(value)", schema_data_type, true),
|
Field::new("MAX(value)", schema_data_type, true),
|
||||||
@@ -1223,7 +1493,16 @@ mod test {
|
|||||||
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
|
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
|
||||||
\n+------------+------------+---------------------+-------+",
|
\n+------------+------------+---------------------+-------+",
|
||||||
);
|
);
|
||||||
do_range_select_test(10_000, 10_000, 1_000_000, Fill::Null, true, expected).await;
|
do_range_select_test(
|
||||||
|
10_000,
|
||||||
|
10_000,
|
||||||
|
1_000_000,
|
||||||
|
Some(Fill::Null),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
expected,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
@@ -1246,7 +1525,16 @@ mod test {
|
|||||||
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
||||||
\n+------------+------------+---------------------+-------+",
|
\n+------------+------------+---------------------+-------+",
|
||||||
);
|
);
|
||||||
do_range_select_test(10_000, 5_000, 5_000, Fill::Null, true, expected).await;
|
do_range_select_test(
|
||||||
|
10_000,
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
Some(Fill::Null),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
expected,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
@@ -1269,7 +1557,16 @@ mod test {
|
|||||||
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
||||||
\n+------------+------------+---------------------+-------+",
|
\n+------------+------------+---------------------+-------+",
|
||||||
);
|
);
|
||||||
do_range_select_test(10_000, 5_000, 5_000, Fill::Prev, true, expected).await;
|
do_range_select_test(
|
||||||
|
10_000,
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
Some(Fill::Prev),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
expected,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
@@ -1292,7 +1589,16 @@ mod test {
|
|||||||
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
||||||
\n+------------+------------+---------------------+-------+",
|
\n+------------+------------+---------------------+-------+",
|
||||||
);
|
);
|
||||||
do_range_select_test(10_000, 5_000, 5_000, Fill::Linear, true, expected).await;
|
do_range_select_test(
|
||||||
|
10_000,
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
Some(Fill::Linear),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
expected,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
@@ -1315,7 +1621,16 @@ mod test {
|
|||||||
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
|
||||||
\n+------------+------------+---------------------+-------+",
|
\n+------------+------------+---------------------+-------+",
|
||||||
);
|
);
|
||||||
do_range_select_test(10_000, 5_000, 5_000, Fill::Linear, false, expected).await;
|
do_range_select_test(
|
||||||
|
10_000,
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
Some(Fill::Linear),
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
expected,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
@@ -1342,7 +1657,101 @@ mod test {
|
|||||||
10_000,
|
10_000,
|
||||||
5_000,
|
5_000,
|
||||||
5_000,
|
5_000,
|
||||||
Fill::Const(ScalarValue::Float64(Some(6.6))),
|
Some(Fill::Const(ScalarValue::Float64(Some(6.6)))),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
expected,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn range_fill_gap() {
|
||||||
|
let expected = String::from(
|
||||||
|
"+------------+------------+---------------------+-------+\
|
||||||
|
\n| MIN(value) | MAX(value) | timestamp | host |\
|
||||||
|
\n+------------+------------+---------------------+-------+\
|
||||||
|
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
|
||||||
|
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
|
||||||
|
\n+------------+------------+---------------------+-------+",
|
||||||
|
);
|
||||||
|
do_range_select_test(5_000, 5_000, 5_000, None, true, true, expected).await;
|
||||||
|
let expected = String::from(
|
||||||
|
"+------------+------------+---------------------+-------+\
|
||||||
|
\n| MIN(value) | MAX(value) | timestamp | host |\
|
||||||
|
\n+------------+------------+---------------------+-------+\
|
||||||
|
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
|
||||||
|
\n| | | 1970-01-01T00:00:05 | host1 |\
|
||||||
|
\n| | | 1970-01-01T00:00:10 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
|
||||||
|
\n| | | 1970-01-01T00:00:05 | host2 |\
|
||||||
|
\n| | | 1970-01-01T00:00:10 | host2 |\
|
||||||
|
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
|
||||||
|
\n+------------+------------+---------------------+-------+",
|
||||||
|
);
|
||||||
|
do_range_select_test(5_000, 5_000, 5_000, Some(Fill::Null), true, true, expected).await;
|
||||||
|
let expected = String::from(
|
||||||
|
"+------------+------------+---------------------+-------+\
|
||||||
|
\n| MIN(value) | MAX(value) | timestamp | host |\
|
||||||
|
\n+------------+------------+---------------------+-------+\
|
||||||
|
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
|
||||||
|
\n| 0.0 | 0.0 | 1970-01-01T00:00:05 | host1 |\
|
||||||
|
\n| 0.0 | 0.0 | 1970-01-01T00:00:10 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:05 | host2 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:10 | host2 |\
|
||||||
|
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
|
||||||
|
\n+------------+------------+---------------------+-------+",
|
||||||
|
);
|
||||||
|
do_range_select_test(5_000, 5_000, 5_000, Some(Fill::Prev), true, true, expected).await;
|
||||||
|
let expected = String::from(
|
||||||
|
"+------------+------------+---------------------+-------+\
|
||||||
|
\n| MIN(value) | MAX(value) | timestamp | host |\
|
||||||
|
\n+------------+------------+---------------------+-------+\
|
||||||
|
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
|
||||||
|
\n| 2.0 | 2.0 | 1970-01-01T00:00:05 | host1 |\
|
||||||
|
\n| 4.0 | 4.0 | 1970-01-01T00:00:10 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
|
||||||
|
\n| 8.0 | 8.0 | 1970-01-01T00:00:05 | host2 |\
|
||||||
|
\n| 10.0 | 10.0 | 1970-01-01T00:00:10 | host2 |\
|
||||||
|
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
|
||||||
|
\n+------------+------------+---------------------+-------+",
|
||||||
|
);
|
||||||
|
do_range_select_test(
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
Some(Fill::Linear),
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
expected,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let expected = String::from(
|
||||||
|
"+------------+------------+---------------------+-------+\
|
||||||
|
\n| MIN(value) | MAX(value) | timestamp | host |\
|
||||||
|
\n+------------+------------+---------------------+-------+\
|
||||||
|
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:05 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:10 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:05 | host2 |\
|
||||||
|
\n| 6.0 | 6.0 | 1970-01-01T00:00:10 | host2 |\
|
||||||
|
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
|
||||||
|
\n+------------+------------+---------------------+-------+",
|
||||||
|
);
|
||||||
|
do_range_select_test(
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
5_000,
|
||||||
|
Some(Fill::Const(ScalarValue::Float64(Some(6.0)))),
|
||||||
|
true,
|
||||||
true,
|
true,
|
||||||
expected,
|
expected,
|
||||||
)
|
)
|
||||||
@@ -1351,7 +1760,8 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn fill_test() {
|
fn fill_test() {
|
||||||
assert!(Fill::try_from_str("Linear", &DataType::UInt8).unwrap() == Fill::Linear);
|
assert!(Fill::try_from_str("", &DataType::UInt8).unwrap().is_none());
|
||||||
|
assert!(Fill::try_from_str("Linear", &DataType::UInt8).unwrap() == Some(Fill::Linear));
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
Fill::try_from_str("Linear", &DataType::Boolean)
|
Fill::try_from_str("Linear", &DataType::Boolean)
|
||||||
.unwrap_err()
|
.unwrap_err()
|
||||||
@@ -1372,7 +1782,7 @@ mod test {
|
|||||||
);
|
);
|
||||||
assert!(
|
assert!(
|
||||||
Fill::try_from_str("8", &DataType::UInt8).unwrap()
|
Fill::try_from_str("8", &DataType::UInt8).unwrap()
|
||||||
== Fill::Const(ScalarValue::UInt8(Some(8)))
|
== Some(Fill::Const(ScalarValue::UInt8(Some(8))))
|
||||||
);
|
);
|
||||||
let mut test1 = vec![
|
let mut test1 = vec![
|
||||||
ScalarValue::UInt8(Some(8)),
|
ScalarValue::UInt8(Some(8)),
|
||||||
@@ -1447,4 +1857,44 @@ mod test {
|
|||||||
Fill::Linear.apply_fill_strategy(&ts, &mut test1).unwrap();
|
Fill::Linear.apply_fill_strategy(&ts, &mut test1).unwrap();
|
||||||
assert_eq!(test, test1);
|
assert_eq!(test, test1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_fist_last_accumulator() {
|
||||||
|
let mut acc = RangeFirstListValueAcc::new(vec![
|
||||||
|
SortOptions {
|
||||||
|
descending: true,
|
||||||
|
nulls_first: false,
|
||||||
|
},
|
||||||
|
SortOptions {
|
||||||
|
descending: false,
|
||||||
|
nulls_first: true,
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
let batch1: Vec<Arc<dyn Array>> = vec![
|
||||||
|
Arc::new(nullable_array!(Float64;
|
||||||
|
0.0, null, 0.0, null, 1.0
|
||||||
|
)),
|
||||||
|
Arc::new(nullable_array!(Float64;
|
||||||
|
5.0, null, 4.0, null, 3.0
|
||||||
|
)),
|
||||||
|
Arc::new(nullable_array!(Int64;
|
||||||
|
1, 2, 3, 4, 5
|
||||||
|
)),
|
||||||
|
];
|
||||||
|
let batch2: Vec<Arc<dyn Array>> = vec![
|
||||||
|
Arc::new(nullable_array!(Float64;
|
||||||
|
3.0, 3.0, 3.0, 3.0, 3.0
|
||||||
|
)),
|
||||||
|
Arc::new(nullable_array!(Float64;
|
||||||
|
null,3.0, 3.0, 3.0, 3.0
|
||||||
|
)),
|
||||||
|
Arc::new(nullable_array!(Int64;
|
||||||
|
6, 7, 8, 9, 10
|
||||||
|
)),
|
||||||
|
];
|
||||||
|
acc.update_batch(&batch1).unwrap();
|
||||||
|
assert_eq!(acc.evaluate().unwrap(), ScalarValue::Int64(Some(5)));
|
||||||
|
acc.update_batch(&batch2).unwrap();
|
||||||
|
assert_eq!(acc.evaluate().unwrap(), ScalarValue::Int64(Some(6)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
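The heart of this diff is the switch from `HashMap` to `BTreeMap` for `align_ts_accumulator`: an ordered map makes the first and last aligned timestamps cheap to find, so the empty slots between them can be enumerated and padded instead of sorting keys per series. A minimal self-contained sketch of that idea (standalone Rust, not the GreptimeDB types; `aligned_timestamps` is a made-up name for illustration):

use std::collections::BTreeMap;

// Millisecond timestamps keyed by their aligned slot; the Vec<f64> stands in
// for the per-slot accumulators.
fn aligned_timestamps(
    accumulator: &BTreeMap<i64, Vec<f64>>,
    align: i64,
    need_fill: bool,
) -> Vec<i64> {
    if accumulator.is_empty() {
        return Vec::new(); // skip empty time series
    }
    // The ordered map hands us the first and last slot directly.
    let begin = *accumulator.first_key_value().unwrap().0;
    let end = *accumulator.last_key_value().unwrap().0;
    if need_fill {
        // Enumerate every slot, including the empty ones that must be padded.
        (begin..=end).step_by(align as usize).collect()
    } else {
        // No FILL clause: only slots that actually hold data are emitted.
        accumulator.keys().copied().collect()
    }
}

fn main() {
    let mut acc = BTreeMap::new();
    acc.insert(0, vec![0.0]);
    acc.insert(15_000, vec![6.0]);
    assert_eq!(aligned_timestamps(&acc, 5_000, true), vec![0, 5_000, 10_000, 15_000]);
    assert_eq!(aligned_timestamps(&acc, 5_000, false), vec![0, 15_000]);
}

With `need_fill` set, the two sparse samples expand to every 5-second slot, which is exactly what the new `range_fill_gap` test above asserts at the SQL level.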
@@ -216,7 +216,7 @@ impl<'a> TreeNodeRewriter for RangeExprRewriter<'a> {
             let mut data_type = range_expr.get_type(self.input_plan.schema())?;
             let mut need_cast = false;
             let fill = Fill::try_from_str(parse_str_expr(&func.args, 2)?, &data_type)?;
-            if matches!(fill, Fill::Linear) && data_type.is_integer() {
+            if matches!(fill, Some(Fill::Linear)) && data_type.is_integer() {
                 data_type = DataType::Float64;
                 need_cast = true;
             }
@@ -224,12 +224,20 @@ impl<'a> TreeNodeRewriter for RangeExprRewriter<'a> {
             inconsistent_check!(self.align, self.align != Duration::default());
             inconsistent_check!(self.align_to, self.align_to != 0);
             let range_fn = RangeFn {
-                name: format!(
-                    "{} RANGE {} FILL {}",
-                    range_expr.display_name()?,
-                    parse_expr_to_string(&func.args, 1)?,
-                    fill
-                ),
+                name: if let Some(fill) = &fill {
+                    format!(
+                        "{} RANGE {} FILL {}",
+                        range_expr.display_name()?,
+                        parse_expr_to_string(&func.args, 1)?,
+                        fill
+                    )
+                } else {
+                    format!(
+                        "{} RANGE {}",
+                        range_expr.display_name()?,
+                        parse_expr_to_string(&func.args, 1)?,
+                    )
+                },
                 data_type,
                 expr: range_expr,
                 range,
@@ -551,7 +559,7 @@ mod test {
     async fn range_no_project() {
         let query = r#"SELECT timestamp, tag_0, tag_1, avg(field_0 + field_1) RANGE '5m' FROM test ALIGN '1h' by (tag_0,tag_1);"#;
         let expected = String::from(
-            "RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N]\
+            "RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, AVG(test.field_0 + test.field_1) RANGE 5m:Float64;N]\
             \n  TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
         );
         query_plan_compare(query, expected).await;
@@ -561,8 +569,8 @@ mod test {
     async fn range_expr_calculation() {
         let query = r#"SELECT (avg(field_0 + field_1)/4) RANGE '5m' FROM test ALIGN '1h' by (tag_0,tag_1);"#;
         let expected = String::from(
-            "Projection: AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL / Int64(4) [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL / Int64(4):Float64;N]\
-            \n  RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
+            "Projection: AVG(test.field_0 + test.field_1) RANGE 5m / Int64(4) [AVG(test.field_0 + test.field_1) RANGE 5m / Int64(4):Float64;N]\
+            \n  RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
             \n    TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
         );
         query_plan_compare(query, expected).await;
@@ -573,8 +581,8 @@ mod test {
         let query =
             r#"SELECT (covar(field_0 + field_1, field_1)/4) RANGE '5m' FROM test ALIGN '1h';"#;
         let expected = String::from(
-            "Projection: COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL / Int64(4) [COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL / Int64(4):Float64;N]\
-            \n  RangeSelect: range_exprs=[COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1, test.tag_2, test.tag_3, test.tag_4], time_index=timestamp [COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8]\
+            "Projection: COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m / Int64(4) [COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m / Int64(4):Float64;N]\
+            \n  RangeSelect: range_exprs=[COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1, test.tag_2, test.tag_3, test.tag_4], time_index=timestamp [COVARIANCE(test.field_0 + test.field_1,test.field_1) RANGE 5m:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8]\
             \n    TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
         );
         query_plan_compare(query, expected).await;
@@ -621,8 +629,8 @@ mod test {
     async fn range_in_expr() {
         let query = r#"SELECT sin(avg(field_0 + field_1) RANGE '5m' + 1) FROM test ALIGN '1h' by (tag_0,tag_1);"#;
         let expected = String::from(
-            "Projection: sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1)) [sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1)):Float64;N]\
-            \n  RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
+            "Projection: sin(AVG(test.field_0 + test.field_1) RANGE 5m + Int64(1)) [sin(AVG(test.field_0 + test.field_1) RANGE 5m + Int64(1)):Float64;N]\
+            \n  RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
             \n    TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
         );
         query_plan_compare(query, expected).await;
@@ -643,8 +651,8 @@ mod test {
     async fn deep_nest_range_expr() {
         let query = r#"SELECT round(sin(avg(field_0 + field_1) RANGE '5m' + 1)) FROM test ALIGN '1h' by (tag_0,tag_1);"#;
         let expected = String::from(
-            "Projection: round(sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1))) [round(sin(AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL + Int64(1))):Float64;N]\
-            \n  RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m FILL NULL:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
+            "Projection: round(sin(AVG(test.field_0 + test.field_1) RANGE 5m + Int64(1))) [round(sin(AVG(test.field_0 + test.field_1) RANGE 5m + Int64(1))):Float64;N]\
+            \n  RangeSelect: range_exprs=[AVG(test.field_0 + test.field_1) RANGE 5m], align=3600000ms, align_to=0ms, align_by=[test.tag_0, test.tag_1], time_index=timestamp [AVG(test.field_0 + test.field_1) RANGE 5m:Float64;N, timestamp:Timestamp(Millisecond, None), tag_0:Utf8, tag_1:Utf8]\
             \n    TableScan: test [tag_0:Utf8, tag_1:Utf8, tag_2:Utf8, tag_3:Utf8, tag_4:Utf8, timestamp:Timestamp(Millisecond, None), field_0:Float64;N, field_1:Float64;N, field_2:Float64;N, field_3:Float64;N, field_4:Float64;N]"
         );
         query_plan_compare(query, expected).await;
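The planner-side counterpart makes the FILL clause optional: `Fill::try_from_str` now yields `Option<Fill>`, and the display name only carries a `FILL` suffix when a strategy was requested, which is why the expected plan strings above lose their `FILL NULL`. A rough standalone sketch of the parse-and-name behavior (the `Fill` enum and both helpers here are simplified stand-ins, not the real types):

// Simplified stand-in for the real Fill enum.
#[derive(Debug, PartialEq)]
enum Fill {
    Null,
    Prev,
    Linear,
}

// An empty FILL argument now means "no fill" instead of defaulting to NULL.
fn try_from_str(s: &str) -> Result<Option<Fill>, String> {
    match s.to_uppercase().as_str() {
        "" => Ok(None),
        "NULL" => Ok(Some(Fill::Null)),
        "PREV" => Ok(Some(Fill::Prev)),
        "LINEAR" => Ok(Some(Fill::Linear)),
        other => Err(format!("{other} is not a valid fill option")),
    }
}

// The FILL suffix only appears when a strategy exists, matching the updated
// expected plans in the tests above.
fn display_name(expr: &str, range: &str, fill: &Option<Fill>) -> String {
    match fill {
        Some(f) => format!("{expr} RANGE {range} FILL {f:?}"),
        None => format!("{expr} RANGE {range}"),
    }
}

fn main() {
    assert_eq!(try_from_str("").unwrap(), None);
    assert_eq!(try_from_str("Linear").unwrap(), Some(Fill::Linear));
    let name = display_name("AVG(value)", "5m", &try_from_str("").unwrap());
    assert_eq!(name, "AVG(value) RANGE 5m");
}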
@@ -237,10 +237,9 @@ async fn query_from_information_schema_table(
         .await
         .context(error::DataFusionSnafu)?;
 
-    Ok(Output::Stream(
-        Box::pin(RecordBatchStreamAdapter::try_new(stream).context(error::CreateRecordBatchSnafu)?),
-        None,
-    ))
+    Ok(Output::new_with_stream(Box::pin(
+        RecordBatchStreamAdapter::try_new(stream).context(error::CreateRecordBatchSnafu)?,
+    )))
 }
 
 pub async fn show_tables(
@@ -303,7 +302,7 @@ pub fn show_variable(stmt: ShowVariables, query_ctx: QueryContextRef) -> Result<
         vec![Arc::new(StringVector::from(vec![value])) as _],
     )
     .context(error::CreateRecordBatchSnafu)?;
-    Ok(Output::RecordBatches(records))
+    Ok(Output::new_with_record_batches(records))
 }
 
 pub fn show_create_table(
@@ -329,7 +328,7 @@ pub fn show_create_table(
     let records = RecordBatches::try_from_columns(SHOW_CREATE_TABLE_OUTPUT_SCHEMA.clone(), columns)
         .context(error::CreateRecordBatchSnafu)?;
 
-    Ok(Output::RecordBatches(records))
+    Ok(Output::new_with_record_batches(records))
 }
 
 pub fn describe_table(table: TableRef) -> Result<Output> {
@@ -345,7 +344,7 @@ pub fn describe_table(table: TableRef) -> Result<Output> {
     ];
     let records = RecordBatches::try_from_columns(DESCRIBE_TABLE_OUTPUT_SCHEMA.clone(), columns)
         .context(error::CreateRecordBatchSnafu)?;
-    Ok(Output::RecordBatches(records))
+    Ok(Output::new_with_record_batches(records))
 }
 
 fn describe_column_names(columns_schemas: &[ColumnSchema]) -> VectorRef {
@@ -572,7 +571,7 @@ fn parse_file_table_format(options: &HashMap<String, String>) -> Result<Box<dyn
 mod test {
     use std::sync::Arc;
 
-    use common_query::Output;
+    use common_query::{Output, OutputData};
     use common_recordbatch::{RecordBatch, RecordBatches};
     use common_time::timestamp::TimeUnit;
     use common_time::Timezone;
@@ -642,7 +641,7 @@ mod test {
         RecordBatches::try_from_columns(DESCRIBE_TABLE_OUTPUT_SCHEMA.clone(), expected_columns)
             .context(error::CreateRecordBatchSnafu)?;
 
-        if let Output::RecordBatches(res) = describe_table(table)? {
+        if let OutputData::RecordBatches(res) = describe_table(table)?.data {
             assert_eq!(res.take(), expected.take());
         } else {
             panic!("describe table must return record batch");
@@ -690,7 +689,10 @@ mod test {
             .timezone(Arc::new(Timezone::from_tz_string(tz).unwrap()))
             .build();
         match show_variable(stmt, ctx) {
-            Ok(Output::RecordBatches(record)) => {
+            Ok(Output {
+                data: OutputData::RecordBatches(record),
+                ..
+            }) => {
                 let record = record.take().first().cloned().unwrap();
                 let data = record.column(0);
                 Ok(data.get(0).to_string())
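Most of the remaining hunks are mechanical fallout of one refactor in `common_query`: the old `Output` enum becomes a struct whose payload lives in an `OutputData` field, so constructors like `Output::new_with_record_batches` replace direct enum variants and call sites match on `output.data`. A compact sketch of the shape this implies (any field besides `data` is hypothetical, and `Vec<String>` stands in for real record batches):

// Stand-in payload: the real OutputData carries affected rows, record
// batches, or a record-batch stream.
enum OutputData {
    AffectedRows(usize),
    RecordBatches(Vec<String>),
}

// The enum-turned-struct: the payload moves into `data` so the struct can
// grow extra fields later (the `meta` field here is purely hypothetical).
struct Output {
    data: OutputData,
    meta: Option<String>,
}

impl Output {
    fn new_with_affected_rows(n: usize) -> Self {
        Output { data: OutputData::AffectedRows(n), meta: None }
    }
    fn new_with_record_batches(batches: Vec<String>) -> Self {
        Output { data: OutputData::RecordBatches(batches), meta: None }
    }
}

fn main() {
    // Call sites now destructure `output.data` instead of `output` itself.
    let output = Output::new_with_affected_rows(1);
    match output.data {
        OutputData::AffectedRows(n) => println!("{n} rows affected"),
        OutputData::RecordBatches(b) => println!("{} batches", b.len()),
    }
    let _ = Output::new_with_record_batches(vec!["batch".to_string()]).meta;
}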
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 use catalog::memory::MemoryCatalogManager;
-use common_query::Output;
+use common_query::OutputData;
 use common_recordbatch::{util, RecordBatch};
 use session::context::QueryContext;
 use table::TableRef;
@@ -43,7 +43,7 @@ async fn exec_selection(engine: QueryEngineRef, sql: &str) -> Vec<RecordBatch> {
         .plan(stmt, query_ctx.clone())
         .await
         .unwrap();
-    let Output::Stream(stream, _) = engine.execute(plan, query_ctx).await.unwrap() else {
+    let OutputData::Stream(stream) = engine.execute(plan, query_ctx).await.unwrap().data else {
         unreachable!()
     };
     util::collect(stream).await.unwrap()
@@ -20,7 +20,7 @@ use common_base::Plugins;
 use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, NUMBERS_TABLE_ID};
 use common_error::ext::BoxedError;
 use common_query::prelude::{create_udf, make_scalar_function, Volatility};
-use common_query::Output;
+use common_query::OutputData;
 use common_recordbatch::{util, RecordBatch};
 use datafusion::datasource::DefaultTableSource;
 use datafusion_expr::logical_plan::builder::LogicalPlanBuilder;
@@ -79,8 +79,8 @@ async fn test_datafusion_query_engine() -> Result<()> {
 
     let output = engine.execute(plan, QueryContext::arc()).await?;
 
-    let recordbatch = match output {
-        Output::Stream(recordbatch, _) => recordbatch,
+    let recordbatch = match output.data {
+        OutputData::Stream(recordbatch) => recordbatch,
         _ => unreachable!(),
     };
 
@@ -17,7 +17,7 @@ use std::sync::Arc;
 
 use catalog::memory::MemoryCatalogManager;
 use common_catalog::consts::NUMBERS_TABLE_ID;
-use common_query::Output;
+use common_query::OutputData;
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use futures::Future;
 use once_cell::sync::{Lazy, OnceCell};
@@ -69,9 +69,9 @@ async fn run_compiled(script: &PyScript) {
         .execute(HashMap::default(), EvalContext::default())
         .await
         .unwrap();
-    let _res = match output {
-        Output::Stream(s, _) => common_recordbatch::util::collect_batches(s).await.unwrap(),
-        Output::RecordBatches(rbs) => rbs,
+    let _res = match output.data {
+        OutputData::Stream(s) => common_recordbatch::util::collect_batches(s).await.unwrap(),
+        OutputData::RecordBatches(rbs) => rbs,
         _ => unreachable!(),
     };
 }
@@ -211,6 +211,8 @@ impl<E: ErrorExt + Send + Sync + 'static> ScriptManager<E> {
 
 #[cfg(test)]
 mod tests {
+    use common_query::OutputData;
+
     use super::*;
     use crate::test::setup_scripts_manager;
 
@@ -261,8 +263,8 @@ def test() -> vector[str]:
         .await
         .unwrap();
 
-    match output {
-        Output::RecordBatches(batches) => {
+    match output.data {
+        OutputData::RecordBatches(batches) => {
             let expected = "\
+-------+
| n     |
@@ -25,10 +25,11 @@ use common_function::function::Function;
 use common_function::function_registry::FUNCTION_REGISTRY;
 use common_query::error::{PyUdfSnafu, UdfTempRecordBatchSnafu};
 use common_query::prelude::Signature;
-use common_query::Output;
+use common_query::{Output, OutputData};
+use common_recordbatch::adapter::RecordBatchMetrics;
 use common_recordbatch::error::{ExternalSnafu, Result as RecordBatchResult};
 use common_recordbatch::{
-    RecordBatch, RecordBatchStream, RecordBatches, SendableRecordBatchStream,
+    OrderOption, RecordBatch, RecordBatchStream, RecordBatches, SendableRecordBatchStream,
 };
 use datafusion_expr::Volatility;
 use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
@@ -255,6 +256,14 @@ impl RecordBatchStream for CoprStream {
         // FIXME(discord9): use copr returns for schema
         self.ret_schema.clone()
     }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        None
+    }
 }
 
 impl Stream for CoprStream {
@@ -311,10 +320,10 @@ impl Script for PyScript {
                 .await
                 .context(DatabaseQuerySnafu)?;
             let copr = self.copr.clone();
-            match res {
-                Output::Stream(stream, _) => Ok(Output::new_stream(Box::pin(CoprStream::try_new(
-                    stream, copr, params, ctx,
-                )?))),
+            match res.data {
+                OutputData::Stream(stream) => Ok(Output::new_with_stream(Box::pin(
+                    CoprStream::try_new(stream, copr, params, ctx)?,
+                ))),
                 _ => unreachable!(),
             }
         } else {
@@ -324,7 +333,7 @@ impl Script for PyScript {
                 .await
                 .context(TokioJoinSnafu)??;
             let batches = RecordBatches::try_new(batch.schema.clone(), vec![batch]).unwrap();
-            Ok(Output::RecordBatches(batches))
+            Ok(Output::new_with_record_batches(batches))
         }
     }
 }
@@ -410,8 +419,8 @@ def test(number) -> vector[u32]:
        .execute(HashMap::default(), EvalContext::default())
        .await
        .unwrap();
-    let res = common_recordbatch::util::collect_batches(match output {
-        Output::Stream(s, _) => s,
+    let res = common_recordbatch::util::collect_batches(match output.data {
+        OutputData::Stream(s) => s,
         _ => unreachable!(),
     })
     .await
@@ -441,8 +450,8 @@ def test(**params) -> vector[i64]:
        .execute(params, EvalContext::default())
        .await
        .unwrap();
-    let res = match _output {
-        Output::RecordBatches(s) => s,
+    let res = match _output.data {
+        OutputData::RecordBatches(s) => s,
         _ => todo!(),
     };
     let rb = res.iter().next().expect("One and only one recordbatch");
@@ -471,8 +480,8 @@ def test(number) -> vector[u32]:
        .execute(HashMap::new(), EvalContext::default())
        .await
        .unwrap();
-    let res = common_recordbatch::util::collect_batches(match _output {
-        Output::Stream(s, _) => s,
+    let res = common_recordbatch::util::collect_batches(match _output.data {
+        OutputData::Stream(s) => s,
         _ => todo!(),
     })
     .await
@@ -503,8 +512,8 @@ def test(a, b, c) -> vector[f64]:
        .execute(HashMap::new(), EvalContext::default())
        .await
        .unwrap();
-    match output {
-        Output::Stream(stream, _) => {
+    match output.data {
+        OutputData::Stream(stream) => {
             let numbers = util::collect(stream).await.unwrap();
 
             assert_eq!(1, numbers.len());
@@ -541,8 +550,8 @@ def test(a) -> vector[i64]:
        .execute(HashMap::new(), EvalContext::default())
        .await
        .unwrap();
-    match output {
-        Output::Stream(stream, _) => {
+    match output.data {
+        OutputData::Stream(stream) => {
             let numbers = util::collect(stream).await.unwrap();
 
             assert_eq!(1, numbers.len());
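Besides the `OutputData` switch, the `CoprStream` hunk implements two stream-trait methods, `output_ordering` and `metrics`, both answering `None` because the coprocessor output promises no ordering and collects no per-stream metrics. A tiny sketch of that pattern with a stand-in trait (not the real `RecordBatchStream` signature):

// Stand-ins for the real descriptor types.
struct OrderOption;
struct RecordBatchMetrics;

// A stand-in for the stream trait that grew the two methods.
trait BatchStream {
    fn output_ordering(&self) -> Option<&[OrderOption]>;
    fn metrics(&self) -> Option<RecordBatchMetrics>;
}

struct CoprStreamSketch;

impl BatchStream for CoprStreamSketch {
    // Coprocessor output makes no ordering promise...
    fn output_ordering(&self) -> Option<&[OrderOption]> {
        None
    }
    // ...and collects no per-stream metrics.
    fn metrics(&self) -> Option<RecordBatchMetrics> {
        None
    }
}

fn main() {
    let s = CoprStreamSketch;
    assert!(s.output_ordering().is_none());
    assert!(s.metrics().is_none());
}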
@@ -19,6 +19,7 @@ use std::collections::HashMap;
 use std::result::Result as StdResult;
 use std::sync::{Arc, Weak};
 
+use common_query::OutputData;
 use common_recordbatch::{RecordBatch, RecordBatches};
 use datatypes::arrow::compute;
 use datatypes::data_type::{ConcreteDataType, DataType};
@@ -399,13 +400,14 @@ impl PyQueryEngine {
                 .await
                 .map_err(|e| e.to_string());
             match res {
-                Ok(common_query::Output::AffectedRows(cnt)) => {
-                    Ok(Either::AffectedRows(cnt))
-                }
-                Ok(common_query::Output::RecordBatches(rbs)) => Ok(Either::Rb(rbs)),
-                Ok(common_query::Output::Stream(s, _)) => Ok(Either::Rb(
-                    common_recordbatch::util::collect_batches(s).await.unwrap(),
-                )),
+                Ok(o) => match o.data {
+                    OutputData::AffectedRows(cnt) => Ok(Either::AffectedRows(cnt)),
+                    OutputData::RecordBatches(rbs) => Ok(Either::Rb(rbs)),
+                    OutputData::Stream(s) => Ok(Either::Rb(
+                        common_recordbatch::util::collect_batches(s).await.unwrap(),
+                    )),
+                },
                 Err(e) => Err(e),
             }
         })?;
@@ -18,7 +18,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 
 use arrow::compute::kernels::numeric;
-use common_query::Output;
+use common_query::OutputData;
 use common_recordbatch::RecordBatch;
 use datafusion::arrow::array::Float64Array;
 use datafusion::arrow::compute;
@@ -87,9 +87,9 @@ async fn integrated_py_copr_test() {
         .execute(HashMap::default(), EvalContext::default())
         .await
         .unwrap();
-    let res = match output {
-        Output::Stream(s, _) => common_recordbatch::util::collect_batches(s).await.unwrap(),
-        Output::RecordBatches(rbs) => rbs,
+    let res = match output.data {
+        OutputData::Stream(s) => common_recordbatch::util::collect_batches(s).await.unwrap(),
+        OutputData::RecordBatches(rbs) => rbs,
         _ => unreachable!(),
     };
     let rb = res.iter().next().expect("One and only one recordbatch");
@@ -24,7 +24,7 @@ use api::v1::{
 };
 use catalog::error::CompileScriptInternalSnafu;
 use common_error::ext::{BoxedError, ErrorExt};
-use common_query::Output;
+use common_query::OutputData;
 use common_recordbatch::{util as record_util, RecordBatch, SendableRecordBatchStream};
 use common_telemetry::logging;
 use common_time::util;
@@ -230,9 +230,9 @@ impl<E: ErrorExt + Send + Sync + 'static> ScriptsTable<E> {
             .execute(LogicalPlan::DfPlan(plan), query_ctx(&table_info))
             .await
             .context(ExecuteInternalStatementSnafu)?;
-        let stream = match output {
-            Output::Stream(stream, _) => stream,
-            Output::RecordBatches(record_batches) => record_batches.as_stream(),
+        let stream = match output.data {
+            OutputData::Stream(stream) => stream,
+            OutputData::RecordBatches(record_batches) => record_batches.as_stream(),
             _ => unreachable!(),
         };
 
@@ -285,9 +285,9 @@ impl<E: ErrorExt + Send + Sync + 'static> ScriptsTable<E> {
             .execute(LogicalPlan::DfPlan(plan), query_ctx(&table_info))
             .await
             .context(ExecuteInternalStatementSnafu)?;
-        let stream = match output {
-            Output::Stream(stream, _) => stream,
-            Output::RecordBatches(record_batches) => record_batches.as_stream(),
+        let stream = match output.data {
+            OutputData::Stream(stream) => stream,
+            OutputData::RecordBatches(record_batches) => record_batches.as_stream(),
             _ => unreachable!(),
         };
         Ok(stream)
@@ -73,6 +73,6 @@ impl GrpcQueryHandler for MockGrpcQueryHandler {
     type Error = Error;
 
     async fn do_query(&self, _query: Request, _ctx: QueryContextRef) -> Result<Output> {
-        Ok(Output::AffectedRows(1))
+        Ok(Output::new_with_affected_rows(1))
    }
 }
Some files were not shown because too many files have changed in this diff.