Compare commits


2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Ruihang Xia | 94409967be | Merge branch 'main' into create-view | 2024-04-22 21:08:22 +08:00 |
| Ruihang Xia | 7503992d61 | add statement (Signed-off-by: Ruihang Xia <waynestxia@gmail.com>) | 2024-04-17 19:13:54 +08:00 |
160 changed files with 2754 additions and 6591 deletions

View File

@@ -22,15 +22,15 @@ inputs:
build-dev-builder-ubuntu: build-dev-builder-ubuntu:
description: Build dev-builder-ubuntu image description: Build dev-builder-ubuntu image
required: false required: false
default: "true" default: 'true'
build-dev-builder-centos: build-dev-builder-centos:
description: Build dev-builder-centos image description: Build dev-builder-centos image
required: false required: false
default: "true" default: 'true'
build-dev-builder-android: build-dev-builder-android:
description: Build dev-builder-android image description: Build dev-builder-android image
required: false required: false
default: "true" default: 'true'
runs: runs:
using: composite using: composite
steps: steps:
@@ -47,7 +47,7 @@ runs:
run: | run: |
make dev-builder \ make dev-builder \
BASE_IMAGE=ubuntu \ BASE_IMAGE=ubuntu \
BUILDX_MULTI_PLATFORM_BUILD=all \ BUILDX_MULTI_PLATFORM_BUILD=true \
IMAGE_REGISTRY=${{ inputs.dockerhub-image-registry }} \ IMAGE_REGISTRY=${{ inputs.dockerhub-image-registry }} \
IMAGE_NAMESPACE=${{ inputs.dockerhub-image-namespace }} \ IMAGE_NAMESPACE=${{ inputs.dockerhub-image-namespace }} \
IMAGE_TAG=${{ inputs.version }} IMAGE_TAG=${{ inputs.version }}
@@ -58,7 +58,7 @@ runs:
run: | run: |
make dev-builder \ make dev-builder \
BASE_IMAGE=centos \ BASE_IMAGE=centos \
BUILDX_MULTI_PLATFORM_BUILD=amd64 \ BUILDX_MULTI_PLATFORM_BUILD=true \
IMAGE_REGISTRY=${{ inputs.dockerhub-image-registry }} \ IMAGE_REGISTRY=${{ inputs.dockerhub-image-registry }} \
IMAGE_NAMESPACE=${{ inputs.dockerhub-image-namespace }} \ IMAGE_NAMESPACE=${{ inputs.dockerhub-image-namespace }} \
IMAGE_TAG=${{ inputs.version }} IMAGE_TAG=${{ inputs.version }}

View File

@@ -16,7 +16,7 @@ inputs:
dev-mode: dev-mode:
description: Enable dev mode, only build standard greptime description: Enable dev mode, only build standard greptime
required: false required: false
default: "false" default: 'false'
working-dir: working-dir:
description: Working directory to build the artifacts description: Working directory to build the artifacts
required: false required: false
@@ -68,7 +68,7 @@ runs:
- name: Build greptime on centos base image - name: Build greptime on centos base image
uses: ./.github/actions/build-greptime-binary uses: ./.github/actions/build-greptime-binary
if: ${{ inputs.arch == 'amd64' && inputs.dev-mode == 'false' }} # Builds greptime for centos if the host machine is amd64. if: ${{ inputs.arch == 'amd64' && inputs.dev-mode == 'false' }} # Only build centos7 base image for amd64.
with: with:
base-image: centos base-image: centos
features: servers/dashboard features: servers/dashboard
@@ -79,7 +79,7 @@ runs:
- name: Build greptime on android base image - name: Build greptime on android base image
uses: ./.github/actions/build-greptime-binary uses: ./.github/actions/build-greptime-binary
if: ${{ inputs.arch == 'amd64' && inputs.dev-mode == 'false' }} # Builds arm64 greptime binary for android if the host machine amd64. if: ${{ inputs.arch == 'amd64' && inputs.dev-mode == 'false' }} # Only build android base image on amd64.
with: with:
base-image: android base-image: android
artifacts-dir: greptime-android-arm64-${{ inputs.version }} artifacts-dir: greptime-android-arm64-${{ inputs.version }}

View File

@@ -26,6 +26,8 @@ runs:
using: composite using: composite
steps: steps:
- uses: arduino/setup-protoc@v3 - uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- name: Install rust toolchain - name: Install rust toolchain
uses: dtolnay/rust-toolchain@master uses: dtolnay/rust-toolchain@master

View File

@@ -147,9 +147,8 @@ jobs:
- name: Set Rust Fuzz - name: Set Rust Fuzz
shell: bash shell: bash
run: | run: |
sudo apt-get install -y libfuzzer-14-dev sudo apt update && sudo apt install -y libfuzzer-14-dev
rustup install nightly cargo install cargo-fuzz
cargo +nightly install cargo-fuzz
- name: Download pre-built binaries - name: Download pre-built binaries
uses: actions/download-artifact@v4 uses: actions/download-artifact@v4
with: with:
@@ -185,13 +184,13 @@ jobs:
- name: Unzip binaries - name: Unzip binaries
run: tar -xvf ./bins.tar.gz run: tar -xvf ./bins.tar.gz
- name: Run sqlness - name: Run sqlness
run: RUST_BACKTRACE=1 ./bins/sqlness-runner -c ./tests/cases --bins-dir ./bins --preserve-state run: RUST_BACKTRACE=1 ./bins/sqlness-runner -c ./tests/cases --bins-dir ./bins
- name: Upload sqlness logs - name: Upload sqlness logs
if: always() if: always()
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: sqlness-logs name: sqlness-logs
path: /tmp/sqlness-* path: /tmp/greptime-*.log
retention-days: 3 retention-days: 3
sqlness-kafka-wal: sqlness-kafka-wal:
@@ -215,13 +214,13 @@ jobs:
working-directory: tests-integration/fixtures/kafka working-directory: tests-integration/fixtures/kafka
run: docker compose -f docker-compose-standalone.yml up -d --wait run: docker compose -f docker-compose-standalone.yml up -d --wait
- name: Run sqlness - name: Run sqlness
run: RUST_BACKTRACE=1 ./bins/sqlness-runner -w kafka -k 127.0.0.1:9092 -c ./tests/cases --bins-dir ./bins --preserve-state run: RUST_BACKTRACE=1 ./bins/sqlness-runner -w kafka -k 127.0.0.1:9092 -c ./tests/cases --bins-dir ./bins
- name: Upload sqlness logs - name: Upload sqlness logs
if: always() if: always()
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: sqlness-logs-with-kafka-wal name: sqlness-logs-with-kafka-wal
path: /tmp/sqlness-* path: /tmp/greptime-*.log
retention-days: 3 retention-days: 3
fmt: fmt:
@@ -331,20 +330,20 @@ jobs:
fail_ci_if_error: false fail_ci_if_error: false
verbose: true verbose: true
# compat: compat:
# name: Compatibility Test name: Compatibility Test
# needs: build needs: build
# runs-on: ubuntu-20.04 runs-on: ubuntu-20.04
# timeout-minutes: 60 timeout-minutes: 60
# steps: steps:
# - uses: actions/checkout@v4 - uses: actions/checkout@v4
# - name: Download pre-built binaries - name: Download pre-built binaries
# uses: actions/download-artifact@v4 uses: actions/download-artifact@v4
# with: with:
# name: bins name: bins
# path: . path: .
# - name: Unzip binaries - name: Unzip binaries
# run: | run: |
# mkdir -p ./bins/current mkdir -p ./bins/current
# tar -xvf ./bins.tar.gz --strip-components=1 -C ./bins/current tar -xvf ./bins.tar.gz --strip-components=1 -C ./bins/current
# - run: ./tests/compat/test-compat.sh 0.6.0 - run: ./tests/compat/test-compat.sh 0.6.0

Cargo.lock generated
View File

@@ -4150,8 +4150,8 @@ dependencies = [
[[package]] [[package]]
name = "hydroflow" name = "hydroflow"
version = "0.6.2" version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"bincode", "bincode",
"byteorder", "byteorder",
@@ -4183,7 +4183,7 @@ dependencies = [
[[package]] [[package]]
name = "hydroflow_datalog" name = "hydroflow_datalog"
version = "0.6.0" version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"hydroflow_datalog_core", "hydroflow_datalog_core",
"proc-macro-crate 1.3.1", "proc-macro-crate 1.3.1",
@@ -4194,8 +4194,8 @@ dependencies = [
[[package]] [[package]]
name = "hydroflow_datalog_core" name = "hydroflow_datalog_core"
version = "0.6.1" version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"hydroflow_lang", "hydroflow_lang",
"proc-macro-crate 1.3.1", "proc-macro-crate 1.3.1",
@@ -4209,8 +4209,8 @@ dependencies = [
[[package]] [[package]]
name = "hydroflow_lang" name = "hydroflow_lang"
version = "0.6.2" version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"auto_impl", "auto_impl",
"clap 4.5.4", "clap 4.5.4",
@@ -4230,7 +4230,7 @@ dependencies = [
[[package]] [[package]]
name = "hydroflow_macro" name = "hydroflow_macro"
version = "0.6.0" version = "0.6.0"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"hydroflow_lang", "hydroflow_lang",
"itertools 0.10.5", "itertools 0.10.5",
@@ -4610,9 +4610,9 @@ checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
[[package]] [[package]]
name = "jobserver" name = "jobserver"
version = "0.1.31" version = "0.1.30"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" checksum = "685a7d121ee3f65ae4fddd72b25a04bb36b6af81bc0828f7d5434c0fe60fa3a2"
dependencies = [ dependencies = [
"libc", "libc",
] ]
@@ -4723,8 +4723,8 @@ dependencies = [
[[package]] [[package]]
name = "lattices" name = "lattices"
version = "0.5.4" version = "0.5.3"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"cc-traits", "cc-traits",
"sealed", "sealed",
@@ -5416,7 +5416,6 @@ dependencies = [
"common-wal", "common-wal",
"crc32fast", "crc32fast",
"criterion", "criterion",
"crossbeam-utils",
"datafusion", "datafusion",
"datafusion-common", "datafusion-common",
"datafusion-expr", "datafusion-expr",
@@ -6299,6 +6298,7 @@ dependencies = [
"sql", "sql",
"sqlparser 0.44.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=c919990bf62ad38d2b0c0a3bc90b26ad919d51b0)", "sqlparser 0.44.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=c919990bf62ad38d2b0c0a3bc90b26ad919d51b0)",
"store-api", "store-api",
"substrait 0.7.2",
"table", "table",
"tokio", "tokio",
"tonic 0.11.0", "tonic 0.11.0",
@@ -7377,7 +7377,7 @@ checksum = "3b7e158a385023d209d6d5f2585c4b468f6dcb3dd5aca9b75c4f1678c05bb375"
[[package]] [[package]]
name = "pusherator" name = "pusherator"
version = "0.0.5" version = "0.0.5"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"either", "either",
"variadics", "variadics",
@@ -9540,7 +9540,6 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"sqlness", "sqlness",
"tempfile",
"tinytemplate", "tinytemplate",
"tokio", "tokio",
] ]
@@ -11149,7 +11148,7 @@ version = "1.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
dependencies = [ dependencies = [
"cfg-if 0.1.10", "cfg-if 1.0.0",
"rand", "rand",
"static_assertions", "static_assertions",
] ]
@@ -11563,7 +11562,7 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]] [[package]]
name = "variadics" name = "variadics"
version = "0.0.4" version = "0.0.4"
source = "git+https://github.com/GreptimeTeam/hydroflow.git?branch=main#b072ee026f97f8537165e1fb247101e0ab2fb320" source = "git+https://github.com/GreptimeTeam/hydroflow.git?rev=ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94#ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94"
dependencies = [ dependencies = [
"sealed", "sealed",
] ]

View File

@@ -98,7 +98,6 @@ bytemuck = "1.12"
bytes = { version = "1.5", features = ["serde"] } bytes = { version = "1.5", features = ["serde"] }
chrono = { version = "0.4", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] }
clap = { version = "4.4", features = ["derive"] } clap = { version = "4.4", features = ["derive"] }
crossbeam-utils = "0.8"
dashmap = "5.4" dashmap = "5.4"
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "34eda15b73a9e278af8844b30ed2f1c21c10359c" } datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "34eda15b73a9e278af8844b30ed2f1c21c10359c" }
datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", rev = "34eda15b73a9e278af8844b30ed2f1c21c10359c" } datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", rev = "34eda15b73a9e278af8844b30ed2f1c21c10359c" }

View File

@@ -54,10 +54,8 @@ ifneq ($(strip $(RELEASE)),)
CARGO_BUILD_OPTS += --release CARGO_BUILD_OPTS += --release
endif endif
ifeq ($(BUILDX_MULTI_PLATFORM_BUILD), all) ifeq ($(BUILDX_MULTI_PLATFORM_BUILD), true)
BUILDX_MULTI_PLATFORM_BUILD_OPTS := --platform linux/amd64,linux/arm64 --push BUILDX_MULTI_PLATFORM_BUILD_OPTS := --platform linux/amd64,linux/arm64 --push
else ifeq ($(BUILDX_MULTI_PLATFORM_BUILD), amd64)
BUILDX_MULTI_PLATFORM_BUILD_OPTS := --platform linux/amd64 --push
else else
BUILDX_MULTI_PLATFORM_BUILD_OPTS := -o type=docker BUILDX_MULTI_PLATFORM_BUILD_OPTS := -o type=docker
endif endif

View File

@@ -1,136 +0,0 @@
# How to write fuzz tests
This document introduces how to write fuzz tests in GreptimeDB.
## What is a fuzz test
A fuzz test is a tool that leverages deterministic random generation to assist in finding bugs. The goal of fuzz tests is to identify fuzzer-generated inputs that cause panics, crashes, or other unexpected behavior. We use [cargo-fuzz](https://github.com/rust-fuzz/cargo-fuzz) to run our fuzz test targets.
## Why we need them
- Find bugs by leveraging random generation
- Integrate with other tests (e.g., e2e)
## Resources
All fuzz test-related resources are located in the `/tests-fuzz` directory.
There are two types of resources: (1) fundamental components and (2) test targets.
### Fundamental components
They are located in the `/tests-fuzz/src` directory. The fundamental components define how to generate SQLs (including dialects for different protocols) and validate execution results (e.g., column attribute validation), etc.
### Test targets
They are located in the `/tests-fuzz/targets` directory, with each file representing an independent fuzz test case. Each target uses the fundamental components to generate SQL, sends the generated statements via the specified protocol, and validates the results of the execution.
Figure 1 illustrates how the fundamental components provide the ability to generate random SQL. A Random Number Generator (Rng) drives an ExprGenerator to produce the Intermediate Representation (IR); a DialectTranslator then renders the IR into the dialect of a specific protocol. Finally, the fuzz test sends the generated SQL via that protocol and verifies that the execution results meet expectations.
```
Rng
|
|
v
ExprGenerator
|
|
v
Intermediate representation (IR)
|
|
+----------------------+----------------------+
| | |
v v v
MySQLTranslator PostgreSQLTranslator OtherDialectTranslator
| | |
| | |
v v v
SQL(MySQL Dialect) ..... .....
|
|
v
Fuzz Test
```
(Figure 1: Overview of fuzz tests)
For more details about fuzz targets and fundamental components, please refer to this [tracking issue](https://github.com/GreptimeTeam/greptimedb/issues/3174).
## How to add a fuzz test target
1. Create an empty Rust source file at `/tests-fuzz/targets/<fuzz-target>.rs`.
2. Register the fuzz test target in the `/tests-fuzz/Cargo.toml` file.
```toml
[[bin]]
name = "<fuzz-target>"
path = "targets/<fuzz-target>.rs"
test = false
bench = false
doc = false
```
3. Define the `FuzzInput` type in `/tests-fuzz/targets/<fuzz-target>.rs`.
```rust
#![no_main]
use libfuzzer_sys::arbitrary::{Arbitrary, Unstructured};
#[derive(Clone, Debug)]
struct FuzzInput {
seed: u64,
}
impl Arbitrary<'_> for FuzzInput {
fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
let seed = u.int_in_range(u64::MIN..=u64::MAX)?;
Ok(FuzzInput { seed })
}
}
```
4. Write your first fuzz test target in `/tests-fuzz/targets/<fuzz-target>.rs`.
```rust
use libfuzzer_sys::fuzz_target;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaChaRng;
use snafu::ResultExt;
use sqlx::{MySql, Pool};
use tests_fuzz::fake::{
merge_two_word_map_fn, random_capitalize_map, uppercase_and_keyword_backtick_map,
MappedGenerator, WordGenerator,
};
use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder;
use tests_fuzz::generator::Generator;
use tests_fuzz::ir::CreateTableExpr;
use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
use tests_fuzz::translator::DslTranslator;
use tests_fuzz::utils::{init_greptime_connections, Connections};
fuzz_target!(|input: FuzzInput| {
common_telemetry::init_default_ut_logging();
common_runtime::block_on_write(async {
let Connections { mysql } = init_greptime_connections().await;
let mut rng = ChaChaRng::seed_from_u64(input.seed);
let columns = rng.gen_range(2..30);
let create_table_generator = CreateTableExprGeneratorBuilder::default()
.name_generator(Box::new(MappedGenerator::new(
WordGenerator,
merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map),
)))
.columns(columns)
.engine("mito")
.if_not_exists(true)
.build()
.unwrap();
let ir = create_table_generator.generate(&mut rng);
let translator = CreateTableExprTranslator;
let sql = translator.translate(&ir).unwrap();
mysql.execute(&sql).await
})
});
```
5. Run your fuzz test target
```bash
cargo fuzz run <fuzz-target> --fuzz-dir tests-fuzz
```
For more details, please refer to this [document](/tests-fuzz/README.md).

View File

@@ -73,7 +73,7 @@ CREATE TABLE cpu (
usage_system DOUBLE, usage_system DOUBLE,
datacenter STRING, datacenter STRING,
TIME INDEX (ts), TIME INDEX (ts),
PRIMARY KEY(datacenter, host)) ENGINE=mito; PRIMARY KEY(datacenter, host)) ENGINE=mito WITH(regions=1);
``` ```
Then the table's `TableMeta` may look like this: Then the table's `TableMeta` may look like this:
@@ -249,7 +249,7 @@ CREATE TABLE cpu (
usage_system DOUBLE, usage_system DOUBLE,
datacenter STRING, datacenter STRING,
TIME INDEX (ts), TIME INDEX (ts),
PRIMARY KEY(datacenter, host)) ENGINE=mito; PRIMARY KEY(datacenter, host)) ENGINE=mito WITH(regions=1);
select ts, usage_system from cpu; select ts, usage_system from cpu;
``` ```

View File

@@ -36,7 +36,6 @@ common-telemetry = { workspace = true, features = [
"deadlock_detection", "deadlock_detection",
] } ] }
common-time.workspace = true common-time.workspace = true
common-version.workspace = true
common-wal.workspace = true common-wal.workspace = true
config = "0.13" config = "0.13"
datanode.workspace = true datanode.workspace = true

View File

@@ -22,7 +22,6 @@ use cmd::options::{CliOptions, Options};
use cmd::{ use cmd::{
cli, datanode, frontend, greptimedb_cli, log_versions, metasrv, standalone, start_app, App, cli, datanode, frontend, greptimedb_cli, log_versions, metasrv, standalone, start_app, App,
}; };
use common_version::{short_version, version};
#[derive(Parser)] #[derive(Parser)]
enum SubCommand { enum SubCommand {
@@ -106,8 +105,7 @@ async fn main() -> Result<()> {
common_telemetry::set_panic_hook(); common_telemetry::set_panic_hook();
let version = version!(); let cli = greptimedb_cli();
let cli = greptimedb_cli().version(version);
let cli = SubCommand::augment_subcommands(cli); let cli = SubCommand::augment_subcommands(cli);
@@ -131,7 +129,7 @@ async fn main() -> Result<()> {
opts.node_id(), opts.node_id(),
); );
log_versions(version, short_version!()); log_versions();
let app = subcmd.build(opts).await?; let app = subcmd.build(opts).await?;

View File

@@ -492,7 +492,9 @@ mod tests {
) )
ENGINE=mito ENGINE=mito
; WITH(
regions = 1
);
"#; "#;
assert_eq!(res.trim(), expect.trim()); assert_eq!(res.trim(), expect.trim());

View File

@@ -192,10 +192,10 @@ impl MigrateTableMetadata {
let key = v1SchemaKey::parse(key_str) let key = v1SchemaKey::parse(key_str)
.unwrap_or_else(|e| panic!("schema key is corrupted: {e}, key: {key_str}")); .unwrap_or_else(|e| panic!("schema key is corrupted: {e}, key: {key_str}"));
Ok(key) Ok((key, ()))
}), }),
); );
while let Some(key) = stream.try_next().await.context(error::IterStreamSnafu)? { while let Some((key, _)) = stream.try_next().await.context(error::IterStreamSnafu)? {
let _ = self.migrate_schema_key(&key).await; let _ = self.migrate_schema_key(&key).await;
keys.push(key.to_string().as_bytes().to_vec()); keys.push(key.to_string().as_bytes().to_vec());
} }
@@ -244,10 +244,10 @@ impl MigrateTableMetadata {
let key = v1CatalogKey::parse(key_str) let key = v1CatalogKey::parse(key_str)
.unwrap_or_else(|e| panic!("catalog key is corrupted: {e}, key: {key_str}")); .unwrap_or_else(|e| panic!("catalog key is corrupted: {e}, key: {key_str}"));
Ok(key) Ok((key, ()))
}), }),
); );
while let Some(key) = stream.try_next().await.context(error::IterStreamSnafu)? { while let Some((key, _)) = stream.try_next().await.context(error::IterStreamSnafu)? {
let _ = self.migrate_catalog_key(&key).await; let _ = self.migrate_catalog_key(&key).await;
keys.push(key.to_string().as_bytes().to_vec()); keys.push(key.to_string().as_bytes().to_vec());
} }

View File

@@ -64,23 +64,26 @@ pub async fn start_app(mut app: Box<dyn App>) -> error::Result<()> {
Ok(()) Ok(())
} }
/// Log the versions of the application, and the arguments passed to the cli. pub fn log_versions() {
/// `version_string` should be the same as the output of cli "--version";
/// and the `app_version` is the short version of the codes, often consist of git branch and commit.
pub fn log_versions(version_string: &str, app_version: &str) {
// Report app version as gauge. // Report app version as gauge.
APP_VERSION APP_VERSION
.with_label_values(&[env!("CARGO_PKG_VERSION"), app_version]) .with_label_values(&[short_version(), full_version()])
.inc(); .inc();
// Log version and argument flags. // Log version and argument flags.
info!("GreptimeDB version: {}", version_string); info!(
"short_version: {}, full_version: {}",
short_version(),
full_version()
);
log_env_flags(); log_env_flags();
} }
pub fn greptimedb_cli() -> clap::Command { pub fn greptimedb_cli() -> clap::Command {
let cmd = clap::Command::new("greptimedb").subcommand_required(true); let cmd = clap::Command::new("greptimedb")
.version(print_version())
.subcommand_required(true);
#[cfg(feature = "tokio-console")] #[cfg(feature = "tokio-console")]
let cmd = cmd.arg(arg!(--"tokio-console-addr"[TOKIO_CONSOLE_ADDR])); let cmd = cmd.arg(arg!(--"tokio-console-addr"[TOKIO_CONSOLE_ADDR]));
@@ -88,6 +91,35 @@ pub fn greptimedb_cli() -> clap::Command {
cmd.args([arg!(--"log-dir"[LOG_DIR]), arg!(--"log-level"[LOG_LEVEL])]) cmd.args([arg!(--"log-dir"[LOG_DIR]), arg!(--"log-level"[LOG_LEVEL])])
} }
fn print_version() -> &'static str {
concat!(
"\nbranch: ",
env!("GIT_BRANCH"),
"\ncommit: ",
env!("GIT_COMMIT"),
"\ndirty: ",
env!("GIT_DIRTY"),
"\nversion: ",
env!("CARGO_PKG_VERSION")
)
}
fn short_version() -> &'static str {
env!("CARGO_PKG_VERSION")
}
// {app_name}-{branch_name}-{commit_short}
// The branch name (tag) of a release build should already contain the short
// version so the full version doesn't concat the short version explicitly.
fn full_version() -> &'static str {
concat!(
"greptimedb-",
env!("GIT_BRANCH"),
"-",
env!("GIT_COMMIT_SHORT")
)
}
fn log_env_flags() { fn log_env_flags() {
info!("command line arguments"); info!("command line arguments");
for argument in std::env::args() { for argument in std::env::args() {
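The `print_version()` and `full_version()` helpers above read `GIT_BRANCH`, `GIT_COMMIT`, `GIT_COMMIT_SHORT`, and `GIT_DIRTY` through `env!`, so those variables must be provided at compile time. The diff does not show how the repository injects them; the sketch below is a hypothetical build script illustrating one way such values could be exported from plain `git` commands.

```rust
// build.rs (hypothetical sketch): export git metadata as compile-time env vars so
// that env!("GIT_BRANCH"), env!("GIT_COMMIT"), env!("GIT_COMMIT_SHORT") and
// env!("GIT_DIRTY") resolve in print_version() / full_version().
use std::process::Command;

// Runs `git <args>` and returns its trimmed stdout, or "unknown" on any failure.
fn git(args: &[&str]) -> String {
    Command::new("git")
        .args(args)
        .output()
        .ok()
        .and_then(|o| String::from_utf8(o.stdout).ok())
        .map(|s| s.trim().to_string())
        .unwrap_or_else(|| "unknown".to_string())
}

fn main() {
    println!("cargo:rustc-env=GIT_BRANCH={}", git(&["rev-parse", "--abbrev-ref", "HEAD"]));
    println!("cargo:rustc-env=GIT_COMMIT={}", git(&["rev-parse", "HEAD"]));
    println!("cargo:rustc-env=GIT_COMMIT_SHORT={}", git(&["rev-parse", "--short", "HEAD"]));
    // A non-empty `git status --porcelain` output means the working tree is dirty.
    let dirty = !git(&["status", "--porcelain"]).is_empty();
    println!("cargo:rustc-env=GIT_DIRTY={}", dirty);
}
```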

View File

@@ -59,7 +59,6 @@ pub enum StatusCode {
RegionNotFound = 4005, RegionNotFound = 4005,
RegionAlreadyExists = 4006, RegionAlreadyExists = 4006,
RegionReadonly = 4007, RegionReadonly = 4007,
/// Region is not in a proper state to handle specific request.
RegionNotReady = 4008, RegionNotReady = 4008,
// If mutually exclusive operations are reached at the same time, // If mutually exclusive operations are reached at the same time,
// only one can be executed, another one will get region busy. // only one can be executed, another one will get region busy.

View File

@@ -15,7 +15,7 @@
pub mod channel_manager; pub mod channel_manager;
pub mod error; pub mod error;
pub mod flight; pub mod flight;
pub mod precision;
pub mod select; pub mod select;
pub mod writer;
pub use error::Error; pub use error::Error;

View File

@@ -1,141 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::Display;
use common_time::timestamp::TimeUnit;
use crate::Error;
/// Precision represents the precision of a timestamp.
/// It is used to convert timestamps between different precisions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Precision {
Nanosecond,
Microsecond,
Millisecond,
Second,
Minute,
Hour,
}
impl Precision {
pub fn to_nanos(&self, amount: i64) -> Option<i64> {
match self {
Precision::Nanosecond => Some(amount),
Precision::Microsecond => amount.checked_mul(1_000),
Precision::Millisecond => amount.checked_mul(1_000_000),
Precision::Second => amount.checked_mul(1_000_000_000),
Precision::Minute => amount
.checked_mul(60)
.and_then(|a| a.checked_mul(1_000_000_000)),
Precision::Hour => amount
.checked_mul(3600)
.and_then(|a| a.checked_mul(1_000_000_000)),
}
}
pub fn to_millis(&self, amount: i64) -> Option<i64> {
match self {
Precision::Nanosecond => amount.checked_div(1_000_000),
Precision::Microsecond => amount.checked_div(1_000),
Precision::Millisecond => Some(amount),
Precision::Second => amount.checked_mul(1_000),
Precision::Minute => amount.checked_mul(60_000),
Precision::Hour => amount.checked_mul(3_600_000),
}
}
}
impl Display for Precision {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Precision::Nanosecond => write!(f, "Precision::Nanosecond"),
Precision::Microsecond => write!(f, "Precision::Microsecond"),
Precision::Millisecond => write!(f, "Precision::Millisecond"),
Precision::Second => write!(f, "Precision::Second"),
Precision::Minute => write!(f, "Precision::Minute"),
Precision::Hour => write!(f, "Precision::Hour"),
}
}
}
impl TryFrom<Precision> for TimeUnit {
type Error = Error;
fn try_from(precision: Precision) -> Result<Self, Self::Error> {
Ok(match precision {
Precision::Second => TimeUnit::Second,
Precision::Millisecond => TimeUnit::Millisecond,
Precision::Microsecond => TimeUnit::Microsecond,
Precision::Nanosecond => TimeUnit::Nanosecond,
_ => {
return Err(Error::NotSupported {
feat: format!("convert {precision} into TimeUnit"),
})
}
})
}
}
#[cfg(test)]
mod tests {
use crate::precision::Precision;
#[test]
fn test_to_nanos() {
assert_eq!(Precision::Nanosecond.to_nanos(1).unwrap(), 1);
assert_eq!(Precision::Microsecond.to_nanos(1).unwrap(), 1_000);
assert_eq!(Precision::Millisecond.to_nanos(1).unwrap(), 1_000_000);
assert_eq!(Precision::Second.to_nanos(1).unwrap(), 1_000_000_000);
assert_eq!(Precision::Minute.to_nanos(1).unwrap(), 60 * 1_000_000_000);
assert_eq!(
Precision::Hour.to_nanos(1).unwrap(),
60 * 60 * 1_000_000_000
);
}
#[test]
fn test_to_millis() {
assert_eq!(Precision::Nanosecond.to_millis(1_000_000).unwrap(), 1);
assert_eq!(Precision::Microsecond.to_millis(1_000).unwrap(), 1);
assert_eq!(Precision::Millisecond.to_millis(1).unwrap(), 1);
assert_eq!(Precision::Second.to_millis(1).unwrap(), 1_000);
assert_eq!(Precision::Minute.to_millis(1).unwrap(), 60 * 1_000);
assert_eq!(Precision::Hour.to_millis(1).unwrap(), 60 * 60 * 1_000);
}
#[test]
fn test_to_nanos_basic() {
assert_eq!(Precision::Second.to_nanos(1), Some(1_000_000_000));
assert_eq!(Precision::Minute.to_nanos(1), Some(60 * 1_000_000_000));
}
#[test]
fn test_to_millis_basic() {
assert_eq!(Precision::Second.to_millis(1), Some(1_000));
assert_eq!(Precision::Minute.to_millis(1), Some(60_000));
}
#[test]
fn test_to_nanos_overflow() {
assert_eq!(Precision::Hour.to_nanos(i64::MAX / 100), None);
}
#[test]
fn test_zero_input() {
assert_eq!(Precision::Second.to_nanos(0), Some(0));
assert_eq!(Precision::Minute.to_millis(0), Some(0));
}
}

View File

@@ -0,0 +1,441 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::Display;
use api::helper::values_with_capacity;
use api::v1::{Column, ColumnDataType, ColumnDataTypeExtension, SemanticType};
use common_base::BitVec;
use common_time::timestamp::TimeUnit;
use snafu::ensure;
use crate::error::{Result, TypeMismatchSnafu};
use crate::Error;
type ColumnName = String;
type RowCount = u32;
// TODO(fys): will remove in the future.
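/// Assembles gRPC `Column`s from row-oriented writes: values are appended per column,
/// and a per-column null mask records which rows did not set that column.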
#[derive(Default)]
pub struct LinesWriter {
column_name_index: HashMap<ColumnName, usize>,
null_masks: Vec<BitVec>,
batch: (Vec<Column>, RowCount),
lines: usize,
}
impl LinesWriter {
pub fn with_lines(lines: usize) -> Self {
Self {
lines,
..Default::default()
}
}
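/// Writes a timestamp value for `column_name`; the raw value is converted to
/// milliseconds according to its `Precision` before being stored.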
pub fn write_ts(&mut self, column_name: &str, value: (i64, Precision)) -> Result<()> {
let (idx, column) = self.mut_column(
column_name,
ColumnDataType::TimestampMillisecond,
SemanticType::Timestamp,
None,
);
ensure!(
column.datatype == ColumnDataType::TimestampMillisecond as i32,
TypeMismatchSnafu {
column_name,
expected: "timestamp",
actual: format!("{:?}", column.datatype)
}
);
// It is safe to use unwrap here, because values has been initialized in mut_column()
let values = column.values.as_mut().unwrap();
values
.timestamp_millisecond_values
.push(to_ms_ts(value.1, value.0));
self.null_masks[idx].push(false);
Ok(())
}
pub fn write_tag(&mut self, column_name: &str, value: &str) -> Result<()> {
let (idx, column) =
self.mut_column(column_name, ColumnDataType::String, SemanticType::Tag, None);
ensure!(
column.datatype == ColumnDataType::String as i32,
TypeMismatchSnafu {
column_name,
expected: "string",
actual: format!("{:?}", column.datatype)
}
);
// It is safe to use unwrap here, because values has been initialized in mut_column()
let values = column.values.as_mut().unwrap();
values.string_values.push(value.to_string());
self.null_masks[idx].push(false);
Ok(())
}
pub fn write_u64(&mut self, column_name: &str, value: u64) -> Result<()> {
let (idx, column) = self.mut_column(
column_name,
ColumnDataType::Uint64,
SemanticType::Field,
None,
);
ensure!(
column.datatype == ColumnDataType::Uint64 as i32,
TypeMismatchSnafu {
column_name,
expected: "u64",
actual: format!("{:?}", column.datatype)
}
);
// It is safe to use unwrap here, because values has been initialized in mut_column()
let values = column.values.as_mut().unwrap();
values.u64_values.push(value);
self.null_masks[idx].push(false);
Ok(())
}
pub fn write_i64(&mut self, column_name: &str, value: i64) -> Result<()> {
let (idx, column) = self.mut_column(
column_name,
ColumnDataType::Int64,
SemanticType::Field,
None,
);
ensure!(
column.datatype == ColumnDataType::Int64 as i32,
TypeMismatchSnafu {
column_name,
expected: "i64",
actual: format!("{:?}", column.datatype)
}
);
// It is safe to use unwrap here, because values has been initialized in mut_column()
let values = column.values.as_mut().unwrap();
values.i64_values.push(value);
self.null_masks[idx].push(false);
Ok(())
}
pub fn write_f64(&mut self, column_name: &str, value: f64) -> Result<()> {
let (idx, column) = self.mut_column(
column_name,
ColumnDataType::Float64,
SemanticType::Field,
None,
);
ensure!(
column.datatype == ColumnDataType::Float64 as i32,
TypeMismatchSnafu {
column_name,
expected: "f64",
actual: format!("{:?}", column.datatype)
}
);
// It is safe to use unwrap here, because values has been initialized in mut_column()
let values = column.values.as_mut().unwrap();
values.f64_values.push(value);
self.null_masks[idx].push(false);
Ok(())
}
pub fn write_string(&mut self, column_name: &str, value: &str) -> Result<()> {
let (idx, column) = self.mut_column(
column_name,
ColumnDataType::String,
SemanticType::Field,
None,
);
ensure!(
column.datatype == ColumnDataType::String as i32,
TypeMismatchSnafu {
column_name,
expected: "string",
actual: format!("{:?}", column.datatype)
}
);
// It is safe to use unwrap here, because values has been initialized in mut_column()
let values = column.values.as_mut().unwrap();
values.string_values.push(value.to_string());
self.null_masks[idx].push(false);
Ok(())
}
pub fn write_bool(&mut self, column_name: &str, value: bool) -> Result<()> {
let (idx, column) = self.mut_column(
column_name,
ColumnDataType::Boolean,
SemanticType::Field,
None,
);
ensure!(
column.datatype == ColumnDataType::Boolean as i32,
TypeMismatchSnafu {
column_name,
expected: "boolean",
actual: format!("{:?}", column.datatype)
}
);
// It is safe to use unwrap here, because values has been initialized in mut_column()
let values = column.values.as_mut().unwrap();
values.bool_values.push(value);
self.null_masks[idx].push(false);
Ok(())
}
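/// Finalizes the current row: increments the row count and appends a null bit to
/// every column that was not written in this row.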
pub fn commit(&mut self) {
let batch = &mut self.batch;
batch.1 += 1;
for i in 0..batch.0.len() {
let null_mask = &mut self.null_masks[i];
if batch.1 as usize > null_mask.len() {
null_mask.push(true);
}
}
}
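/// Moves the accumulated null masks into their columns and returns the columns
/// together with the total row count.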
pub fn finish(mut self) -> (Vec<Column>, RowCount) {
let null_masks = self.null_masks;
for (i, null_mask) in null_masks.into_iter().enumerate() {
let columns = &mut self.batch.0;
columns[i].null_mask = null_mask.into_vec();
}
self.batch
}
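/// Returns the index and a mutable reference to the named column, creating it on
/// first use (pre-filled with null bits for rows committed before the column appeared).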
fn mut_column(
&mut self,
column_name: &str,
datatype: ColumnDataType,
semantic_type: SemanticType,
datatype_extension: Option<ColumnDataTypeExtension>,
) -> (usize, &mut Column) {
let column_names = &mut self.column_name_index;
let column_idx = match column_names.get(column_name) {
Some(i) => *i,
None => {
let new_idx = column_names.len();
let batch = &mut self.batch;
let to_insert = self.lines;
let mut null_mask = BitVec::with_capacity(to_insert);
null_mask.extend(BitVec::repeat(true, batch.1 as usize));
self.null_masks.push(null_mask);
batch.0.push(Column {
column_name: column_name.to_string(),
semantic_type: semantic_type.into(),
values: Some(values_with_capacity(datatype, to_insert)),
datatype: datatype as i32,
null_mask: Vec::default(),
datatype_extension,
});
let _ = column_names.insert(column_name.to_string(), new_idx);
new_idx
}
};
(column_idx, &mut self.batch.0[column_idx])
}
}
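/// Converts a timestamp in the given precision to milliseconds; sub-millisecond
/// precisions are truncated.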
pub fn to_ms_ts(p: Precision, ts: i64) -> i64 {
match p {
Precision::Nanosecond => ts / 1_000_000,
Precision::Microsecond => ts / 1000,
Precision::Millisecond => ts,
Precision::Second => ts * 1000,
Precision::Minute => ts * 1000 * 60,
Precision::Hour => ts * 1000 * 60 * 60,
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Precision {
Nanosecond,
Microsecond,
Millisecond,
Second,
Minute,
Hour,
}
impl Display for Precision {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Precision::Nanosecond => write!(f, "Precision::Nanosecond"),
Precision::Microsecond => write!(f, "Precision::Microsecond"),
Precision::Millisecond => write!(f, "Precision::Millisecond"),
Precision::Second => write!(f, "Precision::Second"),
Precision::Minute => write!(f, "Precision::Minute"),
Precision::Hour => write!(f, "Precision::Hour"),
}
}
}
impl TryFrom<Precision> for TimeUnit {
type Error = Error;
fn try_from(precision: Precision) -> std::result::Result<Self, Self::Error> {
Ok(match precision {
Precision::Second => TimeUnit::Second,
Precision::Millisecond => TimeUnit::Millisecond,
Precision::Microsecond => TimeUnit::Microsecond,
Precision::Nanosecond => TimeUnit::Nanosecond,
_ => {
return Err(Error::NotSupported {
feat: format!("convert {precision} into TimeUnit"),
})
}
})
}
}
#[cfg(test)]
mod tests {
use api::v1::{ColumnDataType, SemanticType};
use common_base::BitVec;
use super::LinesWriter;
use crate::writer::{to_ms_ts, Precision};
#[test]
fn test_lines_writer() {
let mut writer = LinesWriter::with_lines(3);
writer.write_tag("host", "host1").unwrap();
writer.write_f64("cpu", 0.5).unwrap();
writer.write_f64("memory", 0.4).unwrap();
writer.write_string("name", "name1").unwrap();
writer
.write_ts("ts", (101011000, Precision::Millisecond))
.unwrap();
writer.commit();
writer.write_tag("host", "host2").unwrap();
writer
.write_ts("ts", (102011001, Precision::Millisecond))
.unwrap();
writer.write_bool("enable_reboot", true).unwrap();
writer.write_u64("year_of_service", 2).unwrap();
writer.write_i64("temperature", 4).unwrap();
writer.commit();
writer.write_tag("host", "host3").unwrap();
writer.write_f64("cpu", 0.4).unwrap();
writer.write_u64("cpu_core_num", 16).unwrap();
writer
.write_ts("ts", (103011002, Precision::Millisecond))
.unwrap();
writer.commit();
let insert_batch = writer.finish();
assert_eq!(3, insert_batch.1);
let columns = insert_batch.0;
assert_eq!(9, columns.len());
let column = &columns[0];
assert_eq!("host", columns[0].column_name);
assert_eq!(ColumnDataType::String as i32, column.datatype);
assert_eq!(SemanticType::Tag as i32, column.semantic_type);
assert_eq!(
vec!["host1", "host2", "host3"],
column.values.as_ref().unwrap().string_values
);
verify_null_mask(&column.null_mask, vec![false, false, false]);
let column = &columns[1];
assert_eq!("cpu", column.column_name);
assert_eq!(ColumnDataType::Float64 as i32, column.datatype);
assert_eq!(SemanticType::Field as i32, column.semantic_type);
assert_eq!(vec![0.5, 0.4], column.values.as_ref().unwrap().f64_values);
verify_null_mask(&column.null_mask, vec![false, true, false]);
let column = &columns[2];
assert_eq!("memory", column.column_name);
assert_eq!(ColumnDataType::Float64 as i32, column.datatype);
assert_eq!(SemanticType::Field as i32, column.semantic_type);
assert_eq!(vec![0.4], column.values.as_ref().unwrap().f64_values);
verify_null_mask(&column.null_mask, vec![false, true, true]);
let column = &columns[3];
assert_eq!("name", column.column_name);
assert_eq!(ColumnDataType::String as i32, column.datatype);
assert_eq!(SemanticType::Field as i32, column.semantic_type);
assert_eq!(vec!["name1"], column.values.as_ref().unwrap().string_values);
verify_null_mask(&column.null_mask, vec![false, true, true]);
let column = &columns[4];
assert_eq!("ts", column.column_name);
assert_eq!(ColumnDataType::TimestampMillisecond as i32, column.datatype);
assert_eq!(SemanticType::Timestamp as i32, column.semantic_type);
assert_eq!(
vec![101011000, 102011001, 103011002],
column.values.as_ref().unwrap().timestamp_millisecond_values
);
verify_null_mask(&column.null_mask, vec![false, false, false]);
let column = &columns[5];
assert_eq!("enable_reboot", column.column_name);
assert_eq!(ColumnDataType::Boolean as i32, column.datatype);
assert_eq!(SemanticType::Field as i32, column.semantic_type);
assert_eq!(vec![true], column.values.as_ref().unwrap().bool_values);
verify_null_mask(&column.null_mask, vec![true, false, true]);
let column = &columns[6];
assert_eq!("year_of_service", column.column_name);
assert_eq!(ColumnDataType::Uint64 as i32, column.datatype);
assert_eq!(SemanticType::Field as i32, column.semantic_type);
assert_eq!(vec![2], column.values.as_ref().unwrap().u64_values);
verify_null_mask(&column.null_mask, vec![true, false, true]);
let column = &columns[7];
assert_eq!("temperature", column.column_name);
assert_eq!(ColumnDataType::Int64 as i32, column.datatype);
assert_eq!(SemanticType::Field as i32, column.semantic_type);
assert_eq!(vec![4], column.values.as_ref().unwrap().i64_values);
verify_null_mask(&column.null_mask, vec![true, false, true]);
let column = &columns[8];
assert_eq!("cpu_core_num", column.column_name);
assert_eq!(ColumnDataType::Uint64 as i32, column.datatype);
assert_eq!(SemanticType::Field as i32, column.semantic_type);
assert_eq!(vec![16], column.values.as_ref().unwrap().u64_values);
verify_null_mask(&column.null_mask, vec![true, true, false]);
}
fn verify_null_mask(data: &[u8], expected: Vec<bool>) {
let bitvec = BitVec::from_slice(data);
for (idx, b) in expected.iter().enumerate() {
assert_eq!(b, bitvec.get(idx).unwrap())
}
}
#[test]
fn test_to_ms() {
assert_eq!(100, to_ms_ts(Precision::Nanosecond, 100110000));
assert_eq!(100110, to_ms_ts(Precision::Microsecond, 100110000));
assert_eq!(100110000, to_ms_ts(Precision::Millisecond, 100110000));
assert_eq!(
100110000 * 1000 * 60,
to_ms_ts(Precision::Minute, 100110000)
);
assert_eq!(
100110000 * 1000 * 60 * 60,
to_ms_ts(Precision::Hour, 100110000)
);
}
}

View File

@@ -51,7 +51,7 @@ impl AlterTableProcedure {
AlterKind::RenameTable { new_table_name } => { AlterKind::RenameTable { new_table_name } => {
new_info.name = new_table_name.to_string(); new_info.name = new_table_name.to_string();
} }
AlterKind::DropColumns { .. } | AlterKind::ChangeColumnTypes { .. } => {} AlterKind::DropColumns { .. } => {}
} }
Ok(new_info) Ok(new_info)

View File

@@ -271,7 +271,7 @@ impl CreateTableProcedure {
/// ///
/// Abort(not-retry): /// Abort(not-retry):
/// - Failed to create table metadata. /// - Failed to create table metadata.
async fn on_create_metadata(&mut self) -> Result<Status> { async fn on_create_metadata(&self) -> Result<Status> {
let table_id = self.table_id(); let table_id = self.table_id();
let manager = &self.context.table_metadata_manager; let manager = &self.context.table_metadata_manager;
@@ -285,7 +285,6 @@ impl CreateTableProcedure {
.await?; .await?;
info!("Created table metadata for table {table_id}"); info!("Created table metadata for table {table_id}");
self.creator.opening_regions.clear();
Ok(Status::done_with_output(table_id)) Ok(Status::done_with_output(table_id))
} }
} }
@@ -386,7 +385,7 @@ impl TableCreator {
} }
} }
#[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, AsRefStr)]
pub enum CreateTableState { pub enum CreateTableState {
/// Prepares to create the table /// Prepares to create the table
Prepare, Prepare,

View File

@@ -165,7 +165,7 @@ mod tests {
async fn test_next_without_logical_tables() { async fn test_next_without_logical_tables() {
let datanode_manager = Arc::new(MockDatanodeManager::new(())); let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
create_physical_table(&ddl_context, 0, "phy").await; create_physical_table(ddl_context.clone(), 0, "phy").await;
// It always starts from Logical // It always starts from Logical
let mut state = DropDatabaseCursor::new(DropTableTarget::Logical); let mut state = DropDatabaseCursor::new(DropTableTarget::Logical);
let mut ctx = DropDatabaseContext { let mut ctx = DropDatabaseContext {
@@ -199,7 +199,7 @@ mod tests {
async fn test_next_with_logical_tables() { async fn test_next_with_logical_tables() {
let datanode_manager = Arc::new(MockDatanodeManager::new(())); let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await; let physical_table_id = create_physical_table(ddl_context.clone(), 0, "phy").await;
create_logical_table(ddl_context.clone(), 0, physical_table_id, "metric_0").await; create_logical_table(ddl_context.clone(), 0, physical_table_id, "metric_0").await;
// It always starts from Logical // It always starts from Logical
let mut state = DropDatabaseCursor::new(DropTableTarget::Logical); let mut state = DropDatabaseCursor::new(DropTableTarget::Logical);

View File

@@ -161,7 +161,7 @@ mod tests {
async fn test_next_with_physical_table() { async fn test_next_with_physical_table() {
let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await; let physical_table_id = create_physical_table(ddl_context.clone(), 0, "phy").await;
let (_, table_route) = ddl_context let (_, table_route) = ddl_context
.table_metadata_manager .table_metadata_manager
.table_route_manager() .table_route_manager()
@@ -211,7 +211,7 @@ mod tests {
async fn test_next_logical_table() { async fn test_next_logical_table() {
let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await; let physical_table_id = create_physical_table(ddl_context.clone(), 0, "phy").await;
create_logical_table(ddl_context.clone(), 0, physical_table_id, "metric").await; create_logical_table(ddl_context.clone(), 0, physical_table_id, "metric").await;
let logical_table_id = physical_table_id + 1; let logical_table_id = physical_table_id + 1;
let (_, table_route) = ddl_context let (_, table_route) = ddl_context
@@ -315,7 +315,7 @@ mod tests {
async fn test_next_retryable_err() { async fn test_next_retryable_err() {
let datanode_manager = Arc::new(MockDatanodeManager::new(RetryErrorDatanodeHandler)); let datanode_manager = Arc::new(MockDatanodeManager::new(RetryErrorDatanodeHandler));
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await; let physical_table_id = create_physical_table(ddl_context.clone(), 0, "phy").await;
let (_, table_route) = ddl_context let (_, table_route) = ddl_context
.table_metadata_manager .table_metadata_manager
.table_route_manager() .table_route_manager()

View File

@@ -46,7 +46,7 @@ pub struct DropTableProcedure {
/// The serializable data. /// The serializable data.
pub data: DropTableData, pub data: DropTableData,
/// The guards of opening regions. /// The guards of opening regions.
pub(crate) dropping_regions: Vec<OperatingRegionGuard>, pub dropping_regions: Vec<OperatingRegionGuard>,
/// The drop table executor. /// The drop table executor.
executor: DropTableExecutor, executor: DropTableExecutor,
} }
@@ -153,7 +153,7 @@ impl DropTableProcedure {
} }
/// Deletes metadata tombstone. /// Deletes metadata tombstone.
async fn on_delete_metadata_tombstone(&mut self) -> Result<Status> { async fn on_delete_metadata_tombstone(&self) -> Result<Status> {
let table_route_value = &TableRouteValue::new( let table_route_value = &TableRouteValue::new(
self.data.task.table_id, self.data.task.table_id,
// Safety: checked // Safety: checked
@@ -163,8 +163,6 @@ impl DropTableProcedure {
self.executor self.executor
.on_delete_metadata_tombstone(&self.context, table_route_value) .on_delete_metadata_tombstone(&self.context, table_route_value)
.await?; .await?;
self.dropping_regions.clear();
Ok(Status::done()) Ok(Status::done())
} }
} }
@@ -268,7 +266,7 @@ impl DropTableData {
} }
/// The state of drop table. /// The state of drop table.
#[derive(Debug, Serialize, Deserialize, AsRefStr, PartialEq)] #[derive(Debug, Serialize, Deserialize, AsRefStr)]
pub enum DropTableState { pub enum DropTableState {
/// Prepares to drop the table /// Prepares to drop the table
Prepare, Prepare,

View File

@@ -52,9 +52,5 @@ pub(crate) fn build_new_physical_table_info(
columns.push(col.column_schema.clone()); columns.push(col.column_schema.clone());
} }
if let Some(time_index) = *time_index {
raw_table_info.meta.schema.column_schemas[time_index].set_time_index();
}
raw_table_info raw_table_info
} }

View File

@@ -47,7 +47,7 @@ pub async fn create_physical_table_metadata(
} }
pub async fn create_physical_table( pub async fn create_physical_table(
ddl_context: &DdlContext, ddl_context: DdlContext,
cluster_id: ClusterId, cluster_id: ClusterId,
name: &str, name: &str,
) -> TableId { ) -> TableId {
@@ -67,7 +67,7 @@ pub async fn create_physical_table(
.unwrap(); .unwrap();
create_physical_table_task.set_table_id(table_id); create_physical_table_task.set_table_id(table_id);
create_physical_table_metadata( create_physical_table_metadata(
ddl_context, &ddl_context,
create_physical_table_task.table_info.clone(), create_physical_table_task.table_info.clone(),
TableRouteValue::Physical(table_route), TableRouteValue::Physical(table_route),
) )
@@ -81,7 +81,7 @@ pub async fn create_logical_table(
cluster_id: ClusterId, cluster_id: ClusterId,
physical_table_id: TableId, physical_table_id: TableId,
table_name: &str, table_name: &str,
) -> TableId { ) {
use std::assert_matches::assert_matches; use std::assert_matches::assert_matches;
let tasks = vec![test_create_logical_table_task(table_name)]; let tasks = vec![test_create_logical_table_task(table_name)];
@@ -91,14 +91,6 @@ pub async fn create_logical_table(
assert_matches!(status, Status::Executing { persist: true }); assert_matches!(status, Status::Executing { persist: true });
let status = procedure.on_create_metadata().await.unwrap(); let status = procedure.on_create_metadata().await.unwrap();
assert_matches!(status, Status::Done { .. }); assert_matches!(status, Status::Done { .. });
let Status::Done {
output: Some(output),
} = status
else {
panic!("Unexpected status: {:?}", status);
};
output.downcast_ref::<Vec<u32>>().unwrap()[0]
} }
pub fn test_create_logical_table_task(name: &str) -> CreateTableTask { pub fn test_create_logical_table_task(name: &str) -> CreateTableTask {

View File

@@ -128,9 +128,9 @@ async fn test_on_prepare_different_physical_table() {
let datanode_manager = Arc::new(MockDatanodeManager::new(())); let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
let phy1_id = create_physical_table(&ddl_context, cluster_id, "phy1").await; let phy1_id = create_physical_table(ddl_context.clone(), cluster_id, "phy1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy1_id, "table1").await; create_logical_table(ddl_context.clone(), cluster_id, phy1_id, "table1").await;
let phy2_id = create_physical_table(&ddl_context, cluster_id, "phy2").await; let phy2_id = create_physical_table(ddl_context.clone(), cluster_id, "phy2").await;
create_logical_table(ddl_context.clone(), cluster_id, phy2_id, "table2").await; create_logical_table(ddl_context.clone(), cluster_id, phy2_id, "table2").await;
let tasks = vec![ let tasks = vec![
@@ -150,7 +150,7 @@ async fn test_on_prepare_logical_table_not_exists() {
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
// Creates physical table // Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await; let phy_id = create_physical_table(ddl_context.clone(), cluster_id, "phy").await;
// Creates 3 logical tables // Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
@@ -172,7 +172,7 @@ async fn test_on_prepare() {
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
// Creates physical table // Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await; let phy_id = create_physical_table(ddl_context.clone(), cluster_id, "phy").await;
// Creates 3 logical tables // Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
@@ -196,7 +196,7 @@ async fn test_on_update_metadata() {
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
// Creates physical table // Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await; let phy_id = create_physical_table(ddl_context.clone(), cluster_id, "phy").await;
// Creates 3 logical tables // Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
@@ -233,7 +233,7 @@ async fn test_on_part_duplicate_alter_request() {
let ddl_context = new_ddl_context(datanode_manager); let ddl_context = new_ddl_context(datanode_manager);
// Creates physical table // Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await; let phy_id = create_physical_table(ddl_context.clone(), cluster_id, "phy").await;
// Creates 3 logical tables // Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;

View File

@@ -21,12 +21,9 @@ use api::v1::{ColumnDataType, SemanticType};
use common_error::ext::ErrorExt; use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode; use common_error::status_code::StatusCode;
use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId, Status}; use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId, Status};
use common_procedure_test::{ use common_procedure_test::MockContextProvider;
execute_procedure_until, execute_procedure_until_done, MockContextProvider,
};
use store_api::storage::RegionId;
use crate::ddl::create_table::{CreateTableProcedure, CreateTableState}; use crate::ddl::create_table::CreateTableProcedure;
use crate::ddl::test_util::columns::TestColumnDefBuilder; use crate::ddl::test_util::columns::TestColumnDefBuilder;
use crate::ddl::test_util::create_table::{ use crate::ddl::test_util::create_table::{
build_raw_table_info_from_expr, TestCreateTableExprBuilder, build_raw_table_info_from_expr, TestCreateTableExprBuilder,
@@ -36,9 +33,8 @@ use crate::ddl::test_util::datanode_handler::{
}; };
use crate::error::Error; use crate::error::Error;
use crate::key::table_route::TableRouteValue; use crate::key::table_route::TableRouteValue;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::rpc::ddl::CreateTableTask; use crate::rpc::ddl::CreateTableTask;
use crate::test_util::{new_ddl_context, new_ddl_context_with_kv_backend, MockDatanodeManager}; use crate::test_util::{new_ddl_context, MockDatanodeManager};
fn test_create_table_task(name: &str) -> CreateTableTask { fn test_create_table_task(name: &str) -> CreateTableTask {
let create_table = TestCreateTableExprBuilder::default() let create_table = TestCreateTableExprBuilder::default()
@@ -248,39 +244,3 @@ async fn test_on_create_metadata() {
let table_id = status.downcast_output_ref::<u32>().unwrap(); let table_id = status.downcast_output_ref::<u32>().unwrap();
assert_eq!(*table_id, 1024); assert_eq!(*table_id, 1024);
} }
#[tokio::test]
async fn test_memory_region_keeper_guard_dropped_on_procedure_done() {
let cluster_id = 1;
let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(datanode_manager, kv_backend);
let task = test_create_table_task("foo");
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context.clone());
execute_procedure_until(&mut procedure, |p| {
p.creator.data.state == CreateTableState::CreateMetadata
})
.await;
// Ensure that after running to the state `CreateMetadata`(just past `DatanodeCreateRegions`),
// the opening regions should be recorded:
let guards = &procedure.creator.opening_regions;
assert_eq!(guards.len(), 1);
let (datanode_id, region_id) = (0, RegionId::new(procedure.table_id(), 0));
assert_eq!(guards[0].info(), (datanode_id, region_id));
assert!(ddl_context
.memory_region_keeper
.contains(datanode_id, region_id));
execute_procedure_until_done(&mut procedure).await;
// Ensure that when run to the end, the opening regions should be cleared:
let guards = &procedure.creator.opening_regions;
assert!(guards.is_empty());
assert!(!ddl_context
.memory_region_keeper
.contains(datanode_id, region_id));
}

View File

@@ -42,7 +42,7 @@ async fn test_drop_database_with_logical_tables() {
.await .await
.unwrap(); .unwrap();
// Creates physical table // Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await; let phy_id = create_physical_table(ddl_context.clone(), cluster_id, "phy").await;
// Creates 3 logical tables // Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
@@ -91,7 +91,7 @@ async fn test_drop_database_retryable_error() {
.await .await
.unwrap(); .unwrap();
// Creates physical table // Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await; let phy_id = create_physical_table(ddl_context.clone(), cluster_id, "phy").await;
// Creates 3 logical tables // Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await; create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;

View File

@@ -19,21 +19,17 @@ use api::v1::region::{region_request, RegionRequest};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::ErrorExt; use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode; use common_error::status_code::StatusCode;
use common_procedure::Procedure; use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId};
use common_procedure_test::{ use common_procedure_test::MockContextProvider;
execute_procedure_until, execute_procedure_until_done, new_test_procedure_context,
};
use store_api::storage::RegionId; use store_api::storage::RegionId;
use table::metadata::TableId;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure; use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
use crate::ddl::drop_table::{DropTableProcedure, DropTableState}; use crate::ddl::drop_table::DropTableProcedure;
use crate::ddl::test_util::create_table::test_create_table_task; use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::datanode_handler::{DatanodeWatcher, NaiveDatanodeHandler}; use crate::ddl::test_util::datanode_handler::{DatanodeWatcher, NaiveDatanodeHandler};
use crate::ddl::test_util::{ use crate::ddl::test_util::{
create_logical_table, create_physical_table, create_physical_table_metadata, create_physical_table_metadata, test_create_logical_table_task, test_create_physical_table_task,
test_create_logical_table_task, test_create_physical_table_task,
}; };
use crate::ddl::{TableMetadata, TableMetadataAllocatorContext}; use crate::ddl::{TableMetadata, TableMetadataAllocatorContext};
use crate::key::table_route::TableRouteValue; use crate::key::table_route::TableRouteValue;
@@ -62,7 +58,14 @@ async fn test_on_prepare_table_not_exists_err() {
.await .await
.unwrap(); .unwrap();
let task = new_drop_table_task("bar", table_id, false); let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "bar".to_string(),
table_id,
drop_if_exists: false,
};
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context); let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err(); let err = procedure.on_prepare().await.unwrap_err();
assert_eq!(err.status_code(), StatusCode::TableNotFound); assert_eq!(err.status_code(), StatusCode::TableNotFound);
@@ -87,12 +90,26 @@ async fn test_on_prepare_table() {
.await .await
.unwrap(); .unwrap();
let task = new_drop_table_task("bar", table_id, true); let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "bar".to_string(),
table_id,
drop_if_exists: true,
};
// Drop if exists // Drop if exists
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone()); let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
procedure.on_prepare().await.unwrap(); procedure.on_prepare().await.unwrap();
let task = new_drop_table_task(table_name, table_id, false); let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: table_name.to_string(),
table_id,
drop_if_exists: false,
};
// Drop table // Drop table
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context); let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
procedure.on_prepare().await.unwrap(); procedure.on_prepare().await.unwrap();
@@ -141,7 +158,13 @@ async fn test_on_datanode_drop_regions() {
.await .await
.unwrap(); .unwrap();
let task = new_drop_table_task(table_name, table_id, false); let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: table_name.to_string(),
table_id,
drop_if_exists: false,
};
// Drop table // Drop table
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context); let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
procedure.on_prepare().await.unwrap(); procedure.on_prepare().await.unwrap();
@@ -211,7 +234,10 @@ async fn test_on_rollback() {
ddl_context.clone(), ddl_context.clone(),
); );
procedure.on_prepare().await.unwrap(); procedure.on_prepare().await.unwrap();
let ctx = new_test_procedure_context(); let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
provider: Arc::new(MockContextProvider::default()),
};
procedure.execute(&ctx).await.unwrap(); procedure.execute(&ctx).await.unwrap();
// Triggers procedure to create table metadata // Triggers procedure to create table metadata
let status = procedure.execute(&ctx).await.unwrap(); let status = procedure.execute(&ctx).await.unwrap();
@@ -221,10 +247,20 @@ async fn test_on_rollback() {
let expected_kvs = kv_backend.dump(); let expected_kvs = kv_backend.dump();
// Drops the physical table // Drops the physical table
{ {
let task = new_drop_table_task("phy_table", physical_table_id, false); let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "phy_table".to_string(),
table_id: physical_table_id,
drop_if_exists: false,
};
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone()); let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
procedure.on_prepare().await.unwrap(); procedure.on_prepare().await.unwrap();
procedure.on_delete_metadata().await.unwrap(); procedure.on_delete_metadata().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
provider: Arc::new(MockContextProvider::default()),
};
procedure.rollback(&ctx).await.unwrap(); procedure.rollback(&ctx).await.unwrap();
// Rollback again // Rollback again
procedure.rollback(&ctx).await.unwrap(); procedure.rollback(&ctx).await.unwrap();
@@ -233,66 +269,23 @@ async fn test_on_rollback() {
} }
// Drops the logical table // Drops the logical table
let task = new_drop_table_task("foo", table_ids[0], false); let task = DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: "foo".to_string(),
table_id: table_ids[0],
drop_if_exists: false,
};
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone()); let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
procedure.on_prepare().await.unwrap(); procedure.on_prepare().await.unwrap();
procedure.on_delete_metadata().await.unwrap(); procedure.on_delete_metadata().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
provider: Arc::new(MockContextProvider::default()),
};
procedure.rollback(&ctx).await.unwrap(); procedure.rollback(&ctx).await.unwrap();
// Rollback again // Rollback again
procedure.rollback(&ctx).await.unwrap(); procedure.rollback(&ctx).await.unwrap();
let kvs = kv_backend.dump(); let kvs = kv_backend.dump();
assert_eq!(kvs, expected_kvs); assert_eq!(kvs, expected_kvs);
} }
fn new_drop_table_task(table_name: &str, table_id: TableId, drop_if_exists: bool) -> DropTableTask {
DropTableTask {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table: table_name.to_string(),
table_id,
drop_if_exists,
}
}
#[tokio::test]
async fn test_memory_region_keeper_guard_dropped_on_procedure_done() {
let cluster_id = 1;
let datanode_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(datanode_manager, kv_backend);
let physical_table_id = create_physical_table(&ddl_context, cluster_id, "t").await;
let logical_table_id =
create_logical_table(ddl_context.clone(), cluster_id, physical_table_id, "s").await;
let inner_test = |task: DropTableTask| async {
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
execute_procedure_until(&mut procedure, |p| {
p.data.state == DropTableState::InvalidateTableCache
})
.await;
// Ensure that after running to the state `InvalidateTableCache`(just past `DeleteMetadata`),
// the dropping regions should be recorded:
let guards = &procedure.dropping_regions;
assert_eq!(guards.len(), 1);
let (datanode_id, region_id) = (0, RegionId::new(physical_table_id, 0));
assert_eq!(guards[0].info(), (datanode_id, region_id));
assert!(ddl_context
.memory_region_keeper
.contains(datanode_id, region_id));
execute_procedure_until_done(&mut procedure).await;
// Ensure that when run to the end, the dropping regions should be cleared:
let guards = &procedure.dropping_regions;
assert!(guards.is_empty());
assert!(!ddl_context
.memory_region_keeper
.contains(datanode_id, region_id));
};
inner_test(new_drop_table_task("s", logical_table_id, false)).await;
inner_test(new_drop_table_task("t", physical_table_id, false)).await;
}
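
The test_memory_region_keeper_guard_dropped_on_procedure_done tests in this file and in the create-table test file above check the same invariant: the guards a procedure holds are RAII handles that deregister themselves from the shared keeper once the procedure runs to completion. A std-only sketch of that idea, assuming nothing about the real MemoryRegionKeeper API (the Keeper and RegionGuard names are illustrative):

use std::collections::HashSet;
use std::sync::{Arc, Mutex};

type DatanodeId = u64;
type RegionId = u64;

/// Shared registry of regions that are currently being operated on.
#[derive(Default, Clone)]
struct Keeper {
    inner: Arc<Mutex<HashSet<(DatanodeId, RegionId)>>>,
}

/// Guard returned on registration; removes its entry when dropped.
struct RegionGuard {
    keeper: Keeper,
    key: (DatanodeId, RegionId),
}

impl Keeper {
    fn register(&self, datanode: DatanodeId, region: RegionId) -> RegionGuard {
        self.inner.lock().unwrap().insert((datanode, region));
        RegionGuard { keeper: self.clone(), key: (datanode, region) }
    }

    fn contains(&self, datanode: DatanodeId, region: RegionId) -> bool {
        self.inner.lock().unwrap().contains(&(datanode, region))
    }
}

impl Drop for RegionGuard {
    fn drop(&mut self) {
        self.keeper.inner.lock().unwrap().remove(&self.key);
    }
}

fn main() {
    let keeper = Keeper::default();
    let guard = keeper.register(0, 42);
    assert!(keeper.contains(0, 42));
    drop(guard); // the procedure finishing drops its guards
    assert!(!keeper.contains(0, 42));
}

Dropping the guard is what clears the entry, which is why the tests only need to drive the procedure to done and then assert that contains() turns false.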

View File

@@ -258,7 +258,7 @@ pub enum Error {
error: Utf8Error, error: Utf8Error,
}, },
#[snafu(display("Table not found: '{}'", table_name))] #[snafu(display("Table nod found, table: {}", table_name))]
TableNotFound { TableNotFound {
table_name: String, table_name: String,
location: Location, location: Location,

View File

@@ -17,6 +17,7 @@ use std::sync::Arc;
use common_catalog::consts::DEFAULT_CATALOG_NAME; use common_catalog::consts::DEFAULT_CATALOG_NAME;
use futures::stream::BoxStream; use futures::stream::BoxStream;
use futures::StreamExt;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
@@ -83,11 +84,11 @@ impl<'a> TryFrom<&'a str> for CatalogNameKey<'a> {
} }
/// Decoder `KeyValue` to ({catalog},()) /// Decoder `KeyValue` to ({catalog},())
pub fn catalog_decoder(kv: KeyValue) -> Result<String> { pub fn catalog_decoder(kv: KeyValue) -> Result<(String, ())> {
let str = std::str::from_utf8(&kv.key).context(error::ConvertRawKeySnafu)?; let str = std::str::from_utf8(&kv.key).context(error::ConvertRawKeySnafu)?;
let catalog_name = CatalogNameKey::try_from(str)?; let catalog_name = CatalogNameKey::try_from(str)?;
Ok(catalog_name.catalog.to_string()) Ok((catalog_name.catalog.to_string(), ()))
} }
pub struct CatalogManager { pub struct CatalogManager {
@@ -133,7 +134,7 @@ impl CatalogManager {
Arc::new(catalog_decoder), Arc::new(catalog_decoder),
); );
Box::pin(stream) Box::pin(stream.map(|kv| kv.map(|kv| kv.0)))
} }
} }
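
The two decoder signatures above differ only in shape: one returns the catalog name directly, the other returns a (name, ()) pair so it fits the two-parameter KeyValueDecoderFn<K, V> used by the pagination stream later in this diff, with the unit component stripped off again via stream.map. The schema and datanode-table decoders below get the same mechanical treatment. A self-contained sketch of the pattern, with a simplified stand-in for the real KeyValue and without the catalog key parsing:

// Simplified stand-in for the real rpc::KeyValue.
struct KeyValue {
    key: Vec<u8>,
    value: Vec<u8>,
}

type Result<T> = std::result::Result<T, String>;

// One shape: decode straight to the item the stream should yield.
fn decode_item(kv: KeyValue) -> Result<String> {
    String::from_utf8(kv.key).map_err(|e| e.to_string())
}

// The other shape: decode to a (key, value) pair; catalogs carry no value, so the value is ().
fn decode_pair(kv: KeyValue) -> Result<(String, ())> {
    Ok((decode_item(kv)?, ()))
}

fn main() {
    let kvs = vec![
        KeyValue { key: b"greptime".to_vec(), value: vec![] },
        KeyValue { key: b"system".to_vec(), value: vec![] },
    ];
    // Mirrors `stream.map(|kv| kv.map(|kv| kv.0))`: decode to pairs, then keep only the key part.
    let names: Vec<String> = kvs
        .into_iter()
        .map(decode_pair)
        .map(|r| r.map(|(name, _unit)| name))
        .collect::<Result<_>>()
        .unwrap();
    assert_eq!(names, vec!["greptime".to_string(), "system".to_string()]);
}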

View File

@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use futures::stream::BoxStream; use futures::stream::BoxStream;
use futures::StreamExt;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::OptionExt; use snafu::OptionExt;
use store_api::storage::RegionNumber; use store_api::storage::RegionNumber;
@@ -125,8 +126,10 @@ impl DatanodeTableValue {
} }
/// Decodes `KeyValue` to ((),`DatanodeTableValue`) /// Decodes `KeyValue` to ((),`DatanodeTableValue`)
pub fn datanode_table_value_decoder(kv: KeyValue) -> Result<DatanodeTableValue> { pub fn datanode_table_value_decoder(kv: KeyValue) -> Result<((), DatanodeTableValue)> {
DatanodeTableValue::try_from_raw_value(&kv.value) let value = DatanodeTableValue::try_from_raw_value(&kv.value)?;
Ok(((), value))
} }
pub struct DatanodeTableManager { pub struct DatanodeTableManager {
@@ -160,7 +163,7 @@ impl DatanodeTableManager {
Arc::new(datanode_table_value_decoder), Arc::new(datanode_table_value_decoder),
); );
Box::pin(stream) Box::pin(stream.map(|kv| kv.map(|kv| kv.1)))
} }
/// Builds the create datanode table transactions. It only executes while the primary keys comparing successes. /// Builds the create datanode table transactions. It only executes while the primary keys comparing successes.

View File

@@ -19,6 +19,7 @@ use std::time::Duration;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use futures::stream::BoxStream; use futures::stream::BoxStream;
use futures::StreamExt;
use humantime_serde::re::humantime; use humantime_serde::re::humantime;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
@@ -102,11 +103,11 @@ impl TableMetaKey for SchemaNameKey<'_> {
} }
/// Decodes `KeyValue` to ({schema},()) /// Decodes `KeyValue` to ({schema},())
pub fn schema_decoder(kv: KeyValue) -> Result<String> { pub fn schema_decoder(kv: KeyValue) -> Result<(String, ())> {
let str = std::str::from_utf8(&kv.key).context(error::ConvertRawKeySnafu)?; let str = std::str::from_utf8(&kv.key).context(error::ConvertRawKeySnafu)?;
let schema_name = SchemaNameKey::try_from(str)?; let schema_name = SchemaNameKey::try_from(str)?;
Ok(schema_name.schema.to_string()) Ok((schema_name.schema.to_string(), ()))
} }
impl<'a> TryFrom<&'a str> for SchemaNameKey<'a> { impl<'a> TryFrom<&'a str> for SchemaNameKey<'a> {
@@ -192,7 +193,7 @@ impl SchemaManager {
Arc::new(schema_decoder), Arc::new(schema_decoder),
); );
Box::pin(stream) Box::pin(stream.map(|kv| kv.map(|kv| kv.0)))
} }
} }

View File

@@ -28,13 +28,13 @@ use crate::rpc::store::{RangeRequest, RangeResponse};
use crate::rpc::KeyValue; use crate::rpc::KeyValue;
use crate::util::get_next_prefix_key; use crate::util::get_next_prefix_key;
pub type KeyValueDecoderFn<T> = dyn Fn(KeyValue) -> Result<T> + Send + Sync; pub type KeyValueDecoderFn<K, V> = dyn Fn(KeyValue) -> Result<(K, V)> + Send + Sync;
enum PaginationStreamState<T> { enum PaginationStreamState<K, V> {
/// At the start of reading. /// At the start of reading.
Init, Init,
/// Decoding key value pairs. /// Decoding key value pairs.
Decoding(SimpleKeyValueDecoder<T>), Decoding(SimpleKeyValueDecoder<K, V>),
/// Retrieving data from backend. /// Retrieving data from backend.
Reading(BoxFuture<'static, Result<(PaginationStreamFactory, Option<RangeResponse>)>>), Reading(BoxFuture<'static, Result<(PaginationStreamFactory, Option<RangeResponse>)>>),
/// Error /// Error
@@ -77,7 +77,7 @@ struct PaginationStreamFactory {
} }
impl PaginationStreamFactory { impl PaginationStreamFactory {
fn new( pub fn new(
kv: &KvBackendRef, kv: &KvBackendRef,
key: Vec<u8>, key: Vec<u8>,
range_end: Vec<u8>, range_end: Vec<u8>,
@@ -137,7 +137,7 @@ impl PaginationStreamFactory {
} }
} }
async fn read_next(mut self) -> Result<(Self, Option<RangeResponse>)> { pub async fn read_next(mut self) -> Result<(Self, Option<RangeResponse>)> {
if self.more { if self.more {
let resp = self let resp = self
.adaptive_range(RangeRequest { .adaptive_range(RangeRequest {
@@ -174,19 +174,18 @@ impl PaginationStreamFactory {
} }
} }
pub struct PaginationStream<T> { pub struct PaginationStream<K, V> {
state: PaginationStreamState<T>, state: PaginationStreamState<K, V>,
decoder_fn: Arc<KeyValueDecoderFn<T>>, decoder_fn: Arc<KeyValueDecoderFn<K, V>>,
factory: Option<PaginationStreamFactory>, factory: Option<PaginationStreamFactory>,
} }
impl<T> PaginationStream<T> { impl<K, V> PaginationStream<K, V> {
/// Returns a new [PaginationStream].
pub fn new( pub fn new(
kv: KvBackendRef, kv: KvBackendRef,
req: RangeRequest, req: RangeRequest,
page_size: usize, page_size: usize,
decoder_fn: Arc<KeyValueDecoderFn<T>>, decoder_fn: Arc<KeyValueDecoderFn<K, V>>,
) -> Self { ) -> Self {
Self { Self {
state: PaginationStreamState::Init, state: PaginationStreamState::Init,
@@ -203,13 +202,13 @@ impl<T> PaginationStream<T> {
} }
} }
struct SimpleKeyValueDecoder<T> { struct SimpleKeyValueDecoder<K, V> {
kv: VecDeque<KeyValue>, kv: VecDeque<KeyValue>,
decoder: Arc<KeyValueDecoderFn<T>>, decoder: Arc<KeyValueDecoderFn<K, V>>,
} }
impl<T> Iterator for SimpleKeyValueDecoder<T> { impl<K, V> Iterator for SimpleKeyValueDecoder<K, V> {
type Item = Result<T>; type Item = Result<(K, V)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
if let Some(kv) = self.kv.pop_front() { if let Some(kv) = self.kv.pop_front() {
@@ -220,8 +219,8 @@ impl<T> Iterator for SimpleKeyValueDecoder<T> {
} }
} }
impl<T> Stream for PaginationStream<T> { impl<K, V> Stream for PaginationStream<K, V> {
type Item = Result<T>; type Item = Result<(K, V)>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
loop { loop {
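
Whichever way it is parameterized, the stream does the same work underneath: page through range responses and run the decoder over every returned KeyValue. A reduced, synchronous sketch of that loop, with a plain vector in place of KvBackendRef and the Send + Sync bounds dropped:

use std::sync::Arc;

struct KeyValue {
    key: Vec<u8>,
    value: Vec<u8>,
}

type Result<T> = std::result::Result<T, String>;

// Same shape as the decoder alias above, minus the Send + Sync bounds.
type Decoder<T> = Arc<dyn Fn(KeyValue) -> Result<T>>;

// Toy "backend": a vector consumed one page of `page_size` entries at a time,
// with the decoder applied to every KeyValue, like the stream does per range response.
fn paginate<T>(mut data: Vec<KeyValue>, page_size: usize, decode: Decoder<T>) -> Result<Vec<T>> {
    let mut out = Vec::new();
    while !data.is_empty() {
        // One "range request" worth of results.
        let rest = data.split_off(page_size.min(data.len()));
        let page = std::mem::replace(&mut data, rest);
        for kv in page {
            out.push(decode(kv)?);
        }
    }
    Ok(out)
}

fn main() {
    let data = (0..5u8)
        .map(|i| KeyValue { key: vec![i], value: vec![i * 10] })
        .collect::<Vec<_>>();
    // A (key, value)-pair decoder, matching the K/V-typed variant of the stream.
    let decode: Decoder<(u8, u8)> = Arc::new(|kv: KeyValue| Ok((kv.key[0], kv.value[0])));
    let decoded = paginate(data, 2, decode).unwrap();
    assert_eq!(decoded, vec![(0, 0), (1, 10), (2, 20), (3, 30), (4, 40)]);
}

The real stream does the same thing lazily across its Init/Decoding/Reading states, issuing the next range request only once the current page is drained.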

View File

@@ -114,29 +114,3 @@ pub async fn execute_until_suspended_or_done(
None None
} }
pub fn new_test_procedure_context() -> Context {
Context {
procedure_id: ProcedureId::random(),
provider: Arc::new(MockContextProvider::default()),
}
}
pub async fn execute_procedure_until<P: Procedure>(procedure: &mut P, until: impl Fn(&P) -> bool) {
let mut reached = false;
let context = new_test_procedure_context();
while !matches!(
procedure.execute(&context).await.unwrap(),
Status::Done { .. }
) {
if until(procedure) {
reached = true;
break;
}
}
assert!(
reached,
"procedure '{}' did not reach the expected state",
procedure.type_name()
);
}
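
execute_procedure_until above drives a procedure one step at a time until a caller-supplied predicate holds, and fails the test if the procedure completes first. The same drive-until-predicate pattern, reduced to a plain state machine so it runs standalone (Proc, drive_until, and the states are illustrative, not the common_procedure API):

#[derive(Debug, PartialEq)]
enum State {
    Prepare,
    CreateMetadata,
    Done,
}

struct Proc {
    state: State,
}

impl Proc {
    /// One execution step; returns true once the procedure is done.
    fn step(&mut self) -> bool {
        self.state = match self.state {
            State::Prepare => State::CreateMetadata,
            State::CreateMetadata => State::Done,
            State::Done => return true,
        };
        self.state == State::Done
    }
}

/// Drive the procedure until `until` holds; panic if it finishes first.
fn drive_until(proc: &mut Proc, until: impl Fn(&Proc) -> bool) {
    loop {
        if until(proc) {
            return;
        }
        assert!(!proc.step(), "procedure finished before reaching the expected state");
    }
}

fn main() {
    let mut proc = Proc { state: State::Prepare };
    drive_until(&mut proc, |p| p.state == State::CreateMetadata);
    assert_eq!(proc.state, State::CreateMetadata);
}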

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::fmt::{Debug, Display, Formatter}; use std::fmt::{Debug, Formatter};
use std::sync::Arc; use std::sync::Arc;
use api::greptime_proto::v1::add_column_location::LocationType; use api::greptime_proto::v1::add_column_location::LocationType;
@@ -126,17 +126,6 @@ pub enum AddColumnLocation {
After { column_name: String }, After { column_name: String },
} }
impl Display for AddColumnLocation {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
AddColumnLocation::First => write!(f, r#"FIRST"#),
AddColumnLocation::After { column_name } => {
write!(f, r#"AFTER {column_name}"#)
}
}
}
}
impl From<&AddColumnLocation> for Location { impl From<&AddColumnLocation> for Location {
fn from(value: &AddColumnLocation) -> Self { fn from(value: &AddColumnLocation) -> Self {
match value { match value {
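
The Display impl in this hunk renders the location as the SQL fragment it corresponds to (FIRST / AFTER <column>), which is presumably what statement-building code formats into an ALTER TABLE ... ADD COLUMN clause. A usage sketch that repeats the enum and impl locally so it compiles on its own:

use std::fmt::{Display, Formatter};

// Local copy of the enum and the Display impl from the hunk above.
enum AddColumnLocation {
    First,
    After { column_name: String },
}

impl Display for AddColumnLocation {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            AddColumnLocation::First => write!(f, "FIRST"),
            AddColumnLocation::After { column_name } => write!(f, "AFTER {column_name}"),
        }
    }
}

fn main() {
    assert_eq!(AddColumnLocation::First.to_string(), "FIRST");
    let loc = AddColumnLocation::After { column_name: "host".to_string() };
    // Reads like the tail of an ALTER TABLE ... ADD COLUMN clause.
    assert_eq!(format!("ADD COLUMN mem_usage DOUBLE {loc}"), "ADD COLUMN mem_usage DOUBLE AFTER host");
}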

View File

@@ -103,28 +103,3 @@ pub fn setup_build_info() {
println!("cargo:rustc-env=RUSTC_VERSION={}", build_info.rustc); println!("cargo:rustc-env=RUSTC_VERSION={}", build_info.rustc);
println!("cargo:rustc-env=SOURCE_TIMESTAMP={}", build_info.timestamp); println!("cargo:rustc-env=SOURCE_TIMESTAMP={}", build_info.timestamp);
} }
/// Get the string for the output of cli "--version".
#[macro_export]
macro_rules! version {
() => {
concat!(
"\nbranch: ",
env!("GIT_BRANCH"),
"\ncommit: ",
env!("GIT_COMMIT"),
"\ndirty: ",
env!("GIT_DIRTY"),
"\nversion: ",
env!("CARGO_PKG_VERSION")
)
};
}
/// Short version for reporting metrics.
#[macro_export]
macro_rules! short_version {
() => {
concat!(env!("GIT_BRANCH"), "-", env!("GIT_COMMIT_SHORT"))
};
}
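
Both macros assemble a &'static str at compile time from concat! and env!, relying on the GIT_* variables that setup_build_info above emits through cargo:rustc-env. A standalone sketch cannot assume that build step, so this analogue uses only the variables cargo always sets; the shape of the expansion is the same:

/// Minimal analogue of the version! macro above, restricted to variables cargo always provides
/// so it compiles without the custom build.rs step that emits the GIT_* variables.
macro_rules! version_banner {
    () => {
        concat!(
            "\nname: ",
            env!("CARGO_PKG_NAME"),
            "\nversion: ",
            env!("CARGO_PKG_VERSION")
        )
    };
}

fn main() {
    // A single &'static str assembled entirely at compile time.
    println!("{}", version_banner!());
}

Because everything is resolved at compile time, the banner costs nothing at runtime and can be embedded in "--version" output or metric labels.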

View File

@@ -54,10 +54,6 @@ impl fmt::Debug for ColumnSchema {
if self.is_nullable { "null" } else { "not null" }, if self.is_nullable { "null" } else { "not null" },
)?; )?;
if self.is_time_index {
write!(f, " time_index")?;
}
// Add default constraint if present // Add default constraint if present
if let Some(default_constraint) = &self.default_constraint { if let Some(default_constraint) = &self.default_constraint {
write!(f, " default={:?}", default_constraint)?; write!(f, " default={:?}", default_constraint)?;
@@ -163,14 +159,6 @@ impl ColumnSchema {
self.is_nullable = true; self.is_nullable = true;
} }
/// Set the `is_time_index` to `true` of the column.
/// Similar to [with_time_index] but don't take the ownership.
///
/// [with_time_index]: Self::with_time_index
pub fn set_time_index(&mut self) {
self.is_time_index = true;
}
/// Creates a new [`ColumnSchema`] with given metadata. /// Creates a new [`ColumnSchema`] with given metadata.
pub fn with_metadata(mut self, metadata: Metadata) -> Self { pub fn with_metadata(mut self, metadata: Metadata) -> Self {
self.metadata = metadata; self.metadata = metadata;

View File

@@ -207,21 +207,4 @@ mod tests {
assert!(c.is_null(2)); assert!(c.is_null(2));
} }
} }
#[test]
fn test_safe_cast_to_null() {
let string_vector = Arc::new(StringVector::from(vec![
Some("1"),
Some("hello"),
Some(&i64::MAX.to_string()),
None,
])) as VectorRef;
let to_type = ConcreteDataType::int32_datatype();
let b = string_vector.cast(&to_type).unwrap();
let c = b.as_any().downcast_ref::<Int32Vector>().unwrap();
assert_eq!(Value::Int32(1), c.get(0));
assert_eq!(Value::Null, c.get(1));
assert_eq!(Value::Null, c.get(2));
assert_eq!(Value::Null, c.get(3));
}
} }

View File

@@ -14,14 +14,14 @@ common-error.workspace = true
common-macro.workspace = true common-macro.workspace = true
common-telemetry.workspace = true common-telemetry.workspace = true
common-time.workspace = true common-time.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
datafusion-substrait.workspace = true datafusion-substrait.workspace = true
datatypes.workspace = true datatypes.workspace = true
enum_dispatch = "0.3" enum_dispatch = "0.3"
# This fork is simply for keeping our dependency in our org, and pin the version # This fork is simply for keeping our dependency in our org, and pin the version
# it is the same with upstream repo # it is the same with upstream repo
hydroflow = { git = "https://github.com/GreptimeTeam/hydroflow.git", branch = "main" } datafusion-common.workspace = true
datafusion-expr.workspace = true
hydroflow = { git = "https://github.com/GreptimeTeam/hydroflow.git", rev = "ba2df44efd42b7c4d37ebefbf82e77c6f1d4cb94" }
itertools.workspace = true itertools.workspace = true
num-traits = "0.2" num-traits = "0.2"
serde.workspace = true serde.workspace = true

View File

@@ -18,33 +18,25 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::collections::{BTreeMap, VecDeque}; use std::collections::{BTreeMap, VecDeque};
use std::ops::Range;
use std::rc::Rc; use std::rc::Rc;
use datatypes::data_type::ConcreteDataType;
use datatypes::value::{ListValue, Value};
use hydroflow::futures::SinkExt;
use hydroflow::lattices::cc_traits::Get; use hydroflow::lattices::cc_traits::Get;
use hydroflow::scheduled::graph::Hydroflow; use hydroflow::scheduled::graph::Hydroflow;
use hydroflow::scheduled::graph_ext::GraphExt; use hydroflow::scheduled::graph_ext::GraphExt;
use hydroflow::scheduled::port::{PortCtx, SEND}; use hydroflow::scheduled::port::{PortCtx, SEND};
use itertools::Itertools; use itertools::Itertools;
use snafu::{ensure, OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
use super::state::Scheduler; use super::state::Scheduler;
use crate::adapter::error::{Error, EvalSnafu, InvalidQuerySnafu, NotImplementedSnafu, PlanSnafu}; use crate::adapter::error::{Error, EvalSnafu, InvalidQuerySnafu};
use crate::compute::state::DataflowState; use crate::compute::state::DataflowState;
use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff}; use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff};
use crate::expr::error::{DataTypeSnafu, InternalSnafu};
use crate::expr::{ use crate::expr::{
self, EvalError, GlobalId, LocalId, MapFilterProject, MfpPlan, SafeMfpPlan, ScalarExpr, self, EvalError, GlobalId, LocalId, MapFilterProject, MfpPlan, SafeMfpPlan, ScalarExpr,
}; };
use crate::plan::{AccumulablePlan, KeyValPlan, Plan, ReducePlan}; use crate::plan::Plan;
use crate::repr::{self, DiffRow, KeyValDiffRow, Row}; use crate::repr::{self, DiffRow, KeyValDiffRow, Row};
use crate::utils::{ArrangeHandler, ArrangeReader, ArrangeWriter, Arrangement}; use crate::utils::{ArrangeHandler, Arrangement};
mod map;
mod reduce;
/// The Context for build a Operator with id of `GlobalId` /// The Context for build a Operator with id of `GlobalId`
pub struct Context<'referred, 'df> { pub struct Context<'referred, 'df> {
@@ -94,6 +86,8 @@ impl<'referred, 'df> Context<'referred, 'df> {
} }
} }
// There is a false positive in using `Vec<ScalarExpr>` as key
#[allow(clippy::mutable_key_type)]
impl<'referred, 'df> Context<'referred, 'df> { impl<'referred, 'df> Context<'referred, 'df> {
/// Interpret and execute plan /// Interpret and execute plan
/// ///
@@ -103,59 +97,26 @@ impl<'referred, 'df> Context<'referred, 'df> {
Plan::Constant { rows } => Ok(self.render_constant(rows)), Plan::Constant { rows } => Ok(self.render_constant(rows)),
Plan::Get { id } => self.get_by_id(id), Plan::Get { id } => self.get_by_id(id),
Plan::Let { id, value, body } => self.eval_let(id, value, body), Plan::Let { id, value, body } => self.eval_let(id, value, body),
Plan::Mfp { input, mfp } => self.render_mfp(input, mfp), Plan::Mfp { input, mfp } => {
Plan::Reduce { self.render_map_filter_project_into_executable_dataflow(input, mfp)
input,
key_val_plan,
reduce_plan,
} => self.render_reduce(input, key_val_plan, reduce_plan),
Plan::Join { .. } => NotImplementedSnafu {
reason: "Join is still WIP".to_string(),
} }
.fail(), Plan::Reduce { .. } => todo!(),
Plan::Union { .. } => NotImplementedSnafu { Plan::Join { .. } => todo!(),
reason: "Union is still WIP".to_string(), Plan::Union { .. } => todo!(),
}
.fail(),
} }
} }
/// render Constant, take all rows that have a timestamp not greater than the current time /// render Constant, will only emit the `rows` once.
/// pub fn render_constant(&mut self, mut rows: Vec<DiffRow>) -> CollectionBundle {
/// Always assume input is sorted by timestamp
pub fn render_constant(&mut self, rows: Vec<DiffRow>) -> CollectionBundle {
let (send_port, recv_port) = self.df.make_edge::<_, Toff>("constant"); let (send_port, recv_port) = self.df.make_edge::<_, Toff>("constant");
let mut per_time: BTreeMap<repr::Timestamp, Vec<DiffRow>> = rows
.into_iter()
.group_by(|(_row, ts, _diff)| *ts)
.into_iter()
.map(|(k, v)| (k, v.into_iter().collect_vec()))
.collect();
let now = self.compute_state.current_time_ref();
// TODO(discord9): better way to schedule future run
let scheduler = self.compute_state.get_scheduler();
let scheduler_inner = scheduler.clone();
let subgraph_id =
self.df self.df
.add_subgraph_source("Constant", send_port, move |_ctx, send_port| { .add_subgraph_source("Constant", send_port, move |_ctx, send_port| {
// find the first timestamp that is greater than now if rows.is_empty() {
// use filter_map return;
let mut after = per_time.split_off(&(*now.borrow() + 1));
// swap
std::mem::swap(&mut per_time, &mut after);
let not_great_than_now = after;
not_great_than_now.into_iter().for_each(|(_ts, rows)| {
send_port.give(rows);
});
// schedule the next run
if let Some(next_run_time) = per_time.keys().next().copied() {
scheduler_inner.schedule_at(next_run_time);
} }
send_port.give(std::mem::take(&mut rows));
}); });
scheduler.set_cur_subgraph(subgraph_id);
CollectionBundle::from_collection(Collection::from_port(recv_port)) CollectionBundle::from_collection(Collection::from_port(recv_port))
} }
@@ -200,14 +161,144 @@ impl<'referred, 'df> Context<'referred, 'df> {
let ret = self.render_plan(*body)?; let ret = self.render_plan(*body)?;
Ok(ret) Ok(ret)
} }
/// render MapFilterProject, will only emit the `rows` once. Assume all incoming row's sys time being `now`` and ignore the row's stated sys time
/// TODO(discord9): schedule mfp operator to run when temporal filter need
///
/// `MapFilterProject`(`mfp` for short) is scheduled to run when there is enough amount of input updates
/// ***or*** when a future update in it's output buffer(a `Arrangement`) is supposed to emit now.
pub fn render_map_filter_project_into_executable_dataflow(
&mut self,
input: Box<Plan>,
mfp: MapFilterProject,
) -> Result<CollectionBundle, Error> {
let input = self.render_plan(*input)?;
// TODO(discord9): consider if check if contain temporal to determine if
// need arrange or not, or does this added complexity worth it
let (out_send_port, out_recv_port) = self.df.make_edge::<_, Toff>("mfp");
let input_arity = mfp.input_arity;
// default to have a arrange with only future updates, so it can be empty if no temporal filter is applied
// as stream only sends current updates and etc.
let arrange = Arrangement::new();
let arrange_handler = ArrangeHandler::from(arrange.clone());
let arrange_handler_inner = ArrangeHandler::from(arrange);
// This closure capture following variables:
let mfp_plan = MfpPlan::create_from(mfp)?;
let now = self.compute_state.current_time_ref();
let err_collector = self.err_collector.clone();
// TODO(discord9): better way to schedule future run
let scheduler = self.compute_state.get_scheduler();
let scheduler_inner = scheduler.clone();
let subgraph = self.df.add_subgraph_in_out(
"mfp",
input.collection.into_inner(),
out_send_port,
move |_ctx, recv, send| {
// mfp only need to passively receive updates from recvs
let data = recv.take_inner().into_iter().flat_map(|v| v.into_iter());
mfp_subgraph(
&arrange_handler_inner,
data,
&mfp_plan,
*now.borrow(),
&err_collector,
&scheduler_inner,
send,
);
},
);
// register current subgraph in scheduler for future scheduling
scheduler.set_cur_subgraph(subgraph);
let arranged = BTreeMap::from([(
(0..input_arity).map(ScalarExpr::Column).collect_vec(),
Arranged::new(arrange_handler),
)]);
let bundle = CollectionBundle {
collection: Collection::from_port(out_recv_port),
arranged,
};
Ok(bundle)
}
} }
/// The Common argument for all `Subgraph` in the render process fn mfp_subgraph(
struct SubgraphArg<'a> { arrange: &ArrangeHandler,
input: impl IntoIterator<Item = DiffRow>,
mfp_plan: &MfpPlan,
now: repr::Timestamp, now: repr::Timestamp,
err_collector: &'a ErrCollector, err_collector: &ErrCollector,
scheduler: &'a Scheduler, scheduler: &Scheduler,
send: &'a PortCtx<SEND, Toff>, send: &PortCtx<SEND, Toff>,
) {
let run_mfp = || {
let all_updates = eval_mfp_core(input, mfp_plan, now, err_collector);
arrange.write().apply_updates(now, all_updates)?;
Ok(())
};
err_collector.run(run_mfp);
// Deal with output:
// 1. Read all updates that were emitted between the last time this arrangement had updates and the current time.
// 2. Output the updates.
// 3. Truncate all updates within that range.
let from = arrange.read().last_compaction_time().map(|n| n + 1);
let from = from.unwrap_or(repr::Timestamp::MIN);
let output_kv = arrange.read().get_updates_in_range(from..=now);
// the output is expected to be key -> empty val
let output = output_kv
.into_iter()
.map(|((key, _v), ts, diff)| (key, ts, diff))
.collect_vec();
send.give(output);
let run_compaction = || {
arrange.write().compaction_to(now)?;
Ok(())
};
err_collector.run(run_compaction);
// schedule the next time this operator should run
if let Some(i) = arrange.read().get_next_update_time(&now) {
scheduler.schedule_at(i)
}
}
/// The core of evaluating MFP operator, given a MFP and a input, evaluate the MFP operator,
/// return the output updates **And** possibly any number of errors that occurred during the evaluation
fn eval_mfp_core(
input: impl IntoIterator<Item = DiffRow>,
mfp_plan: &MfpPlan,
now: repr::Timestamp,
err_collector: &ErrCollector,
) -> Vec<KeyValDiffRow> {
let mut all_updates = Vec::new();
for (mut row, _sys_time, diff) in input.into_iter() {
// this updates is expected to be only zero to two rows
let updates = mfp_plan.evaluate::<EvalError>(&mut row.inner, now, diff);
// TODO(discord9): refactor error handling
// Expect error in a single row to not interrupt the whole evaluation
let updates = updates
.filter_map(|r| match r {
Ok((key, ts, diff)) => Some(((key, Row::empty()), ts, diff)),
Err((err, _ts, _diff)) => {
err_collector.push_err(err);
None
}
})
.collect_vec();
all_updates.extend(updates);
}
all_updates
} }
#[cfg(test)] #[cfg(test)]
@@ -225,30 +316,64 @@ mod test {
use crate::expr::BinaryFunc; use crate::expr::BinaryFunc;
use crate::repr::Row; use crate::repr::Row;
pub fn run_and_check( fn harness_test_ctx<'r, 'h>(
state: &mut DataflowState, df: &'r mut Hydroflow<'h>,
df: &mut Hydroflow, state: &'r mut DataflowState,
time_range: Range<i64>, ) -> Context<'r, 'h> {
expected: BTreeMap<i64, Vec<DiffRow>>, let err_collector = state.get_err_collector();
output: Rc<RefCell<Vec<DiffRow>>>, Context {
) { id: GlobalId::User(0),
for now in time_range { df,
state.set_current_ts(now); compute_state: state,
state.run_available_with_schedule(df); input_collection: BTreeMap::new(),
assert!(state.get_err_collector().inner.borrow().is_empty()); local_scope: Default::default(),
if let Some(expected) = expected.get(&now) { err_collector,
assert_eq!(*output.borrow(), *expected, "at ts={}", now);
} else {
assert_eq!(*output.borrow(), vec![], "at ts={}", now);
};
output.borrow_mut().clear();
} }
} }
pub fn get_output_handle( /// test if temporal filter works properly
ctx: &mut Context, /// namely: if mfp operator can schedule a delete at the correct time
mut bundle: CollectionBundle, #[test]
) -> Rc<RefCell<Vec<DiffRow>>> { fn test_render_mfp_with_temporal() {
let mut df = Hydroflow::new();
let mut state = DataflowState::default();
let mut ctx = harness_test_ctx(&mut df, &mut state);
let rows = vec![
(Row::new(vec![1i64.into()]), 1, 1),
(Row::new(vec![2i64.into()]), 2, 1),
(Row::new(vec![3i64.into()]), 3, 1),
];
let collection = ctx.render_constant(rows);
ctx.insert_global(GlobalId::User(1), collection);
let input_plan = Plan::Get {
id: expr::Id::Global(GlobalId::User(1)),
};
// temporal filter: now <= col(0) < now + 4
let mfp = MapFilterProject::new(1)
.filter(vec![
ScalarExpr::Column(0)
.call_unary(expr::UnaryFunc::Cast(ConcreteDataType::datetime_datatype()))
.call_binary(
ScalarExpr::CallUnmaterializable(expr::UnmaterializableFunc::Now),
BinaryFunc::Gte,
),
ScalarExpr::Column(0)
.call_binary(
ScalarExpr::literal(4i64.into(), ConcreteDataType::int64_datatype()),
BinaryFunc::SubInt64,
)
.call_unary(expr::UnaryFunc::Cast(ConcreteDataType::datetime_datatype()))
.call_binary(
ScalarExpr::CallUnmaterializable(expr::UnmaterializableFunc::Now),
BinaryFunc::Lt,
),
])
.unwrap();
let mut bundle = ctx
.render_map_filter_project_into_executable_dataflow(Box::new(input_plan), mfp)
.unwrap();
let collection = bundle.collection; let collection = bundle.collection;
let _arranged = bundle.arranged.pop_first().unwrap().1; let _arranged = bundle.arranged.pop_first().unwrap().1;
let output = Rc::new(RefCell::new(vec![])); let output = Rc::new(RefCell::new(vec![]));
@@ -263,22 +388,93 @@ mod test {
output_inner.borrow_mut().extend(res); output_inner.borrow_mut().extend(res);
}, },
); );
output // drop ctx here to simulate actual process of compile first, run later scenario
drop(ctx);
// expected output at given time
let expected_output = BTreeMap::from([
(
0, // time
vec![
(Row::new(vec![1i64.into()]), 0, 1),
(Row::new(vec![2i64.into()]), 0, 1),
(Row::new(vec![3i64.into()]), 0, 1),
],
),
(
2, // time
vec![(Row::new(vec![1i64.into()]), 2, -1)],
),
(
3, // time
vec![(Row::new(vec![2i64.into()]), 3, -1)],
),
(
4, // time
vec![(Row::new(vec![3i64.into()]), 4, -1)],
),
]);
for now in 0i64..5 {
state.set_current_ts(now);
state.run_available_with_schedule(&mut df);
assert!(state.get_err_collector().inner.borrow().is_empty());
if let Some(expected) = expected_output.get(&now) {
assert_eq!(*output.borrow(), *expected);
} else {
assert_eq!(*output.borrow(), vec![]);
};
output.borrow_mut().clear();
}
} }
pub fn harness_test_ctx<'r, 'h>( /// test if mfp operator without temporal filter works properly
df: &'r mut Hydroflow<'h>, /// that is it filter the rows correctly
state: &'r mut DataflowState, #[test]
) -> Context<'r, 'h> { fn test_render_mfp() {
let err_collector = state.get_err_collector(); let mut df = Hydroflow::new();
Context { let mut state = DataflowState::default();
id: GlobalId::User(0), let mut ctx = harness_test_ctx(&mut df, &mut state);
df,
compute_state: state, let rows = vec![
input_collection: BTreeMap::new(), (Row::new(vec![1.into()]), 1, 1),
local_scope: Default::default(), (Row::new(vec![2.into()]), 2, 1),
err_collector, (Row::new(vec![3.into()]), 3, 1),
} ];
let collection = ctx.render_constant(rows);
ctx.insert_global(GlobalId::User(1), collection);
let input_plan = Plan::Get {
id: expr::Id::Global(GlobalId::User(1)),
};
// filter: col(0)>1
let mfp = MapFilterProject::new(1)
.filter(vec![ScalarExpr::Column(0).call_binary(
ScalarExpr::literal(1.into(), ConcreteDataType::int32_datatype()),
BinaryFunc::Gt,
)])
.unwrap();
let bundle = ctx
.render_map_filter_project_into_executable_dataflow(Box::new(input_plan), mfp)
.unwrap();
let collection = bundle.collection.clone(ctx.df);
ctx.df.add_subgraph_sink(
"test_render_constant",
collection.into_inner(),
move |_ctx, recv| {
let data = recv.take_inner();
let res = data.into_iter().flat_map(|v| v.into_iter()).collect_vec();
assert_eq!(
res,
vec![
(Row::new(vec![2.into()]), 0, 1),
(Row::new(vec![3.into()]), 0, 1),
]
)
},
);
drop(ctx);
df.run_available();
} }
/// test if constant operator works properly /// test if constant operator works properly
@@ -298,7 +494,7 @@ mod test {
let collection = collection.collection.clone(ctx.df); let collection = collection.collection.clone(ctx.df);
let cnt = Rc::new(RefCell::new(0)); let cnt = Rc::new(RefCell::new(0));
let cnt_inner = cnt.clone(); let cnt_inner = cnt.clone();
let res_subgraph_id = ctx.df.add_subgraph_sink( ctx.df.add_subgraph_sink(
"test_render_constant", "test_render_constant",
collection.into_inner(), collection.into_inner(),
move |_ctx, recv| { move |_ctx, recv| {
@@ -306,16 +502,9 @@ mod test {
*cnt_inner.borrow_mut() += data.iter().map(|v| v.len()).sum::<usize>(); *cnt_inner.borrow_mut() += data.iter().map(|v| v.len()).sum::<usize>();
}, },
); );
ctx.compute_state.set_current_ts(2);
ctx.compute_state.run_available_with_schedule(ctx.df);
assert_eq!(*cnt.borrow(), 2);
ctx.compute_state.set_current_ts(3);
ctx.compute_state.run_available_with_schedule(ctx.df);
// to get output
ctx.df.schedule_subgraph(res_subgraph_id);
ctx.df.run_available(); ctx.df.run_available();
assert_eq!(*cnt.borrow(), 3);
ctx.df.run_available();
assert_eq!(*cnt.borrow(), 3); assert_eq!(*cnt.borrow(), 3);
} }
@@ -344,33 +533,4 @@ mod test {
assert_eq!(sum.borrow().to_owned(), 45); assert_eq!(sum.borrow().to_owned(), 45);
} }
#[test]
fn test_tee_auto_schedule() {
use hydroflow::scheduled::handoff::TeeingHandoff as Toff;
let mut df = Hydroflow::new();
let (send_port, recv_port) = df.make_edge::<_, Toff<i32>>("test_handoff");
let source = df.add_subgraph_source("test_handoff_source", send_port, move |_ctx, send| {
for i in 0..10 {
send.give(vec![i]);
}
});
let teed_recv_port = recv_port.tee(&mut df);
let sum = Rc::new(RefCell::new(0));
let sum_move = sum.clone();
let _sink = df.add_subgraph_sink("test_handoff_sink", teed_recv_port, move |_ctx, recv| {
let data = recv.take_inner();
*sum_move.borrow_mut() += data.iter().flat_map(|i| i.iter()).sum::<i32>();
});
drop(recv_port);
df.run_available();
assert_eq!(sum.borrow().to_owned(), 45);
df.schedule_subgraph(source);
df.run_available();
assert_eq!(sum.borrow().to_owned(), 90);
}
} }
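
The reworked render_constant above no longer dumps every row on the first tick: it buckets rows by timestamp in a BTreeMap, emits only the buckets whose time is not greater than now, and asks the scheduler to wake the subgraph at the next pending timestamp. A std-only sketch of that emit-and-reschedule step (the names are illustrative; in the real code this runs inside a Hydroflow source subgraph):

use std::collections::BTreeMap;

type Timestamp = i64;
type Row = Vec<i64>;
type DiffRow = (Row, Timestamp, i64);

/// Emit every bucket with ts <= now, keep the rest buffered, and return the
/// next timestamp this operator should be scheduled at (if any).
fn emit_due(
    buffered: &mut BTreeMap<Timestamp, Vec<DiffRow>>,
    now: Timestamp,
    send: &mut impl FnMut(Vec<DiffRow>),
) -> Option<Timestamp> {
    // Everything strictly after `now` stays buffered for a later tick.
    let mut future = buffered.split_off(&(now + 1));
    std::mem::swap(buffered, &mut future);
    let due = future;
    for (_ts, rows) in due {
        send(rows);
    }
    buffered.keys().next().copied()
}

fn main() {
    let mut buffered: BTreeMap<Timestamp, Vec<DiffRow>> = BTreeMap::from([
        (1, vec![(vec![1], 1, 1)]),
        (3, vec![(vec![3], 3, 1)]),
    ]);
    let mut emitted = Vec::new();
    let next = emit_due(&mut buffered, 2, &mut |rows| emitted.extend(rows));
    assert_eq!(emitted.len(), 1); // only the ts=1 bucket was due at now=2
    assert_eq!(next, Some(3));    // the operator should be woken again at ts=3
}

The split_off/swap pair is the same trick the hunk uses to separate due buckets from future ones without rebuilding the map.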

View File

@@ -1,293 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use hydroflow::scheduled::graph_ext::GraphExt;
use hydroflow::scheduled::port::{PortCtx, SEND};
use itertools::Itertools;
use snafu::OptionExt;
use crate::adapter::error::{Error, PlanSnafu};
use crate::compute::render::Context;
use crate::compute::state::Scheduler;
use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff};
use crate::expr::{EvalError, MapFilterProject, MfpPlan, ScalarExpr};
use crate::plan::Plan;
use crate::repr::{self, DiffRow, KeyValDiffRow, Row};
use crate::utils::ArrangeHandler;
impl<'referred, 'df> Context<'referred, 'df> {
/// render MapFilterProject, will only emit the `rows` once. Assume all incoming row's sys time being `now`` and ignore the row's stated sys time
/// TODO(discord9): schedule mfp operator to run when temporal filter need
///
/// `MapFilterProject`(`mfp` for short) is scheduled to run when there is enough amount of input updates
/// ***or*** when a future update in it's output buffer(a `Arrangement`) is supposed to emit now.
// There is a false positive in using `Vec<ScalarExpr>` as key due to `Value` have `bytes` variant
#[allow(clippy::mutable_key_type)]
pub fn render_mfp(
&mut self,
input: Box<Plan>,
mfp: MapFilterProject,
) -> Result<CollectionBundle, Error> {
let input = self.render_plan(*input)?;
// TODO(discord9): consider if check if contain temporal to determine if
// need arrange or not, or does this added complexity worth it
let (out_send_port, out_recv_port) = self.df.make_edge::<_, Toff>("mfp");
let output_arity = mfp.output_arity();
// default to have a arrange with only future updates, so it can be empty if no temporal filter is applied
// as stream only sends current updates and etc.
let arrange_handler = self.compute_state.new_arrange(None);
let arrange_handler_inner =
arrange_handler
.clone_future_only()
.with_context(|| PlanSnafu {
reason: "No write is expected at this point",
})?;
// This closure capture following variables:
let mfp_plan = MfpPlan::create_from(mfp)?;
let now = self.compute_state.current_time_ref();
let err_collector = self.err_collector.clone();
// TODO(discord9): better way to schedule future run
let scheduler = self.compute_state.get_scheduler();
let scheduler_inner = scheduler.clone();
let subgraph = self.df.add_subgraph_in_out(
"mfp",
input.collection.into_inner(),
out_send_port,
move |_ctx, recv, send| {
// mfp only need to passively receive updates from recvs
let data = recv.take_inner().into_iter().flat_map(|v| v.into_iter());
mfp_subgraph(
&arrange_handler_inner,
data,
&mfp_plan,
*now.borrow(),
&err_collector,
&scheduler_inner,
send,
);
},
);
// register current subgraph in scheduler for future scheduling
scheduler.set_cur_subgraph(subgraph);
let arranged = BTreeMap::from([(
(0..output_arity).map(ScalarExpr::Column).collect_vec(),
Arranged::new(arrange_handler),
)]);
let bundle = CollectionBundle {
collection: Collection::from_port(out_recv_port),
arranged,
};
Ok(bundle)
}
}
fn mfp_subgraph(
arrange: &ArrangeHandler,
input: impl IntoIterator<Item = DiffRow>,
mfp_plan: &MfpPlan,
now: repr::Timestamp,
err_collector: &ErrCollector,
scheduler: &Scheduler,
send: &PortCtx<SEND, Toff>,
) {
let run_mfp = || {
let all_updates = eval_mfp_core(input, mfp_plan, now, err_collector);
arrange.write().apply_updates(now, all_updates)?;
Ok(())
};
err_collector.run(run_mfp);
// Deal with output:
// 1. Read all updates that were emitted between the last time this arrangement had updates and the current time.
// 2. Output the updates.
// 3. Truncate all updates within that range.
let from = arrange.read().last_compaction_time().map(|n| n + 1);
let from = from.unwrap_or(repr::Timestamp::MIN);
let output_kv = arrange.read().get_updates_in_range(from..=now);
// the output is expected to be key -> empty val
let output = output_kv
.into_iter()
.map(|((key, _v), ts, diff)| (key, ts, diff))
.collect_vec();
send.give(output);
let run_compaction = || {
arrange.write().compact_to(now)?;
Ok(())
};
err_collector.run(run_compaction);
// schedule next time this subgraph should run
scheduler.schedule_for_arrange(&arrange.read(), now);
}
/// The core of evaluating MFP operator, given a MFP and a input, evaluate the MFP operator,
/// return the output updates **And** possibly any number of errors that occurred during the evaluation
fn eval_mfp_core(
input: impl IntoIterator<Item = DiffRow>,
mfp_plan: &MfpPlan,
now: repr::Timestamp,
err_collector: &ErrCollector,
) -> Vec<KeyValDiffRow> {
let mut all_updates = Vec::new();
for (mut row, _sys_time, diff) in input.into_iter() {
// this updates is expected to be only zero to two rows
let updates = mfp_plan.evaluate::<EvalError>(&mut row.inner, now, diff);
// TODO(discord9): refactor error handling
// Expect error in a single row to not interrupt the whole evaluation
let updates = updates
.filter_map(|r| match r {
Ok((key, ts, diff)) => Some(((key, Row::empty()), ts, diff)),
Err((err, _ts, _diff)) => {
err_collector.push_err(err);
None
}
})
.collect_vec();
all_updates.extend(updates);
}
all_updates
}
#[cfg(test)]
mod test {
use std::cell::RefCell;
use std::rc::Rc;
use datatypes::data_type::ConcreteDataType;
use hydroflow::scheduled::graph::Hydroflow;
use super::*;
use crate::compute::render::test::{get_output_handle, harness_test_ctx, run_and_check};
use crate::compute::state::DataflowState;
use crate::expr::{self, BinaryFunc, GlobalId};
/// test if temporal filter works properly
/// namely: if mfp operator can schedule a delete at the correct time
#[test]
fn test_render_mfp_with_temporal() {
let mut df = Hydroflow::new();
let mut state = DataflowState::default();
let mut ctx = harness_test_ctx(&mut df, &mut state);
let rows = vec![
(Row::new(vec![1i64.into()]), 0, 1),
(Row::new(vec![2i64.into()]), 0, 1),
(Row::new(vec![3i64.into()]), 0, 1),
];
let collection = ctx.render_constant(rows.clone());
ctx.insert_global(GlobalId::User(1), collection);
let input_plan = Plan::Get {
id: expr::Id::Global(GlobalId::User(1)),
};
// temporal filter: now <= col(0) < now + 4
let mfp = MapFilterProject::new(1)
.filter(vec![
ScalarExpr::Column(0)
.call_unary(expr::UnaryFunc::Cast(ConcreteDataType::datetime_datatype()))
.call_binary(
ScalarExpr::CallUnmaterializable(expr::UnmaterializableFunc::Now),
BinaryFunc::Gte,
),
ScalarExpr::Column(0)
.call_binary(
ScalarExpr::literal(4i64.into(), ConcreteDataType::int64_datatype()),
BinaryFunc::SubInt64,
)
.call_unary(expr::UnaryFunc::Cast(ConcreteDataType::datetime_datatype()))
.call_binary(
ScalarExpr::CallUnmaterializable(expr::UnmaterializableFunc::Now),
BinaryFunc::Lt,
),
])
.unwrap();
let bundle = ctx.render_mfp(Box::new(input_plan), mfp).unwrap();
let output = get_output_handle(&mut ctx, bundle);
// drop ctx here to simulate actual process of compile first, run later scenario
drop(ctx);
// expected output at given time
let expected_output = BTreeMap::from([
(
0, // time
vec![
(Row::new(vec![1i64.into()]), 0, 1),
(Row::new(vec![2i64.into()]), 0, 1),
(Row::new(vec![3i64.into()]), 0, 1),
],
),
(
2, // time
vec![(Row::new(vec![1i64.into()]), 2, -1)],
),
(
3, // time
vec![(Row::new(vec![2i64.into()]), 3, -1)],
),
(
4, // time
vec![(Row::new(vec![3i64.into()]), 4, -1)],
),
]);
run_and_check(&mut state, &mut df, 0..5, expected_output, output);
}
/// test if mfp operator without temporal filter works properly
/// that is it filter the rows correctly
#[test]
fn test_render_mfp() {
let mut df = Hydroflow::new();
let mut state = DataflowState::default();
let mut ctx = harness_test_ctx(&mut df, &mut state);
let rows = vec![
(Row::new(vec![1.into()]), 1, 1),
(Row::new(vec![2.into()]), 2, 1),
(Row::new(vec![3.into()]), 3, 1),
];
let collection = ctx.render_constant(rows.clone());
ctx.insert_global(GlobalId::User(1), collection);
let input_plan = Plan::Get {
id: expr::Id::Global(GlobalId::User(1)),
};
// filter: col(0)>1
let mfp = MapFilterProject::new(1)
.filter(vec![ScalarExpr::Column(0).call_binary(
ScalarExpr::literal(1.into(), ConcreteDataType::int32_datatype()),
BinaryFunc::Gt,
)])
.unwrap();
let bundle = ctx.render_mfp(Box::new(input_plan), mfp).unwrap();
let output = get_output_handle(&mut ctx, bundle);
drop(ctx);
let expected = BTreeMap::from([
(2, vec![(Row::new(vec![2.into()]), 2, 1)]),
(3, vec![(Row::new(vec![3.into()]), 3, 1)]),
]);
run_and_check(&mut state, &mut df, 1..5, expected, output);
}
}

File diff suppressed because it is too large

View File

@@ -21,7 +21,6 @@ use hydroflow::scheduled::SubgraphId;
use crate::compute::types::ErrCollector; use crate::compute::types::ErrCollector;
use crate::repr::{self, Timestamp}; use crate::repr::{self, Timestamp};
use crate::utils::{ArrangeHandler, Arrangement};
/// input/output of a dataflow /// input/output of a dataflow
/// One `ComputeState` manage the input/output/schedule of one `Hydroflow` /// One `ComputeState` manage the input/output/schedule of one `Hydroflow`
@@ -39,24 +38,9 @@ pub struct DataflowState {
/// error collector local to this `ComputeState`, /// error collector local to this `ComputeState`,
/// useful for distinguishing errors from different `Hydroflow` /// useful for distinguishing errors from different `Hydroflow`
err_collector: ErrCollector, err_collector: ErrCollector,
/// save all used arrange in this dataflow, since usually there is no delete operation
/// we can just keep track of all used arrange and schedule subgraph when they need to be updated
arrange_used: Vec<ArrangeHandler>,
} }
impl DataflowState { impl DataflowState {
pub fn new_arrange(&mut self, name: Option<Vec<String>>) -> ArrangeHandler {
let arrange = name.map(Arrangement::new_with_name).unwrap_or_default();
let arr = ArrangeHandler::from(arrange);
// mark this arrange as used in this dataflow
self.arrange_used.push(
arr.clone_future_only()
.expect("No write happening at this point"),
);
arr
}
/// schedule all subgraph that need to run with time <= `as_of` and run_available() /// schedule all subgraph that need to run with time <= `as_of` and run_available()
/// ///
/// return true if any subgraph actually executed /// return true if any subgraph actually executed
@@ -101,9 +85,8 @@ impl DataflowState {
} }
} }
#[derive(Debug, Clone)] #[derive(Clone)]
pub struct Scheduler { pub struct Scheduler {
// this scheduler is shared with `DataflowState`, so it can schedule subgraph
schedule_subgraph: Rc<RefCell<BTreeMap<Timestamp, VecDeque<SubgraphId>>>>, schedule_subgraph: Rc<RefCell<BTreeMap<Timestamp, VecDeque<SubgraphId>>>>,
cur_subgraph: Rc<RefCell<Option<SubgraphId>>>, cur_subgraph: Rc<RefCell<Option<SubgraphId>>>,
} }
@@ -117,12 +100,6 @@ impl Scheduler {
subgraph_queue.push_back(*subgraph); subgraph_queue.push_back(*subgraph);
} }
pub fn schedule_for_arrange(&self, arrange: &Arrangement, now: Timestamp) {
if let Some(i) = arrange.get_next_update_time(&now) {
self.schedule_at(i)
}
}
pub fn set_cur_subgraph(&self, subgraph: SubgraphId) { pub fn set_cur_subgraph(&self, subgraph: SubgraphId) {
self.cur_subgraph.replace(Some(subgraph)); self.cur_subgraph.replace(Some(subgraph));
} }
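
The Scheduler in this hunk keeps a BTreeMap from timestamp to a queue of subgraph ids, and run_available_with_schedule drains every entry whose time is not after the current tick before re-scheduling those subgraphs. A minimal sketch of the drain step, with plain u64 ids standing in for Hydroflow's SubgraphId:

use std::collections::{BTreeMap, VecDeque};

type Timestamp = i64;
type SubgraphId = u64; // stand-in for hydroflow::scheduled::SubgraphId

/// Pop every subgraph scheduled at a time <= now, in timestamp order.
fn take_due(
    schedule: &mut BTreeMap<Timestamp, VecDeque<SubgraphId>>,
    now: Timestamp,
) -> Vec<SubgraphId> {
    let pending = schedule.split_off(&(now + 1)); // strictly-future entries stay queued
    let due = std::mem::replace(schedule, pending);
    due.into_values().flatten().collect()
}

fn main() {
    let mut schedule = BTreeMap::new();
    schedule.entry(1).or_insert_with(VecDeque::new).push_back(7);
    schedule.entry(5).or_insert_with(VecDeque::new).push_back(8);
    assert_eq!(take_due(&mut schedule, 3), vec![7]);
    assert_eq!(schedule.len(), 1); // the ts=5 entry is still waiting
}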

View File

@@ -28,7 +28,7 @@ use crate::expr::{EvalError, ScalarExpr};
use crate::repr::DiffRow; use crate::repr::DiffRow;
use crate::utils::{ArrangeHandler, Arrangement}; use crate::utils::{ArrangeHandler, Arrangement};
pub type Toff<T = DiffRow> = TeeingHandoff<T>; pub type Toff = TeeingHandoff<DiffRow>;
/// A collection, represent a collections of data that is received from a handoff. /// A collection, represent a collections of data that is received from a handoff.
pub struct Collection<T: 'static> { pub struct Collection<T: 'static> {
@@ -107,17 +107,12 @@ impl Arranged {
/// of reading the data from the collection. /// of reading the data from the collection.
pub struct CollectionBundle { pub struct CollectionBundle {
/// This is useful for passively reading the new updates from the collection /// This is useful for passively reading the new updates from the collection
///
/// Invariant: the timestamp of the updates should always not greater than now, since future updates should be stored in the arrangement
pub collection: Collection<DiffRow>, pub collection: Collection<DiffRow>,
/// the key [`ScalarExpr`] indicate how the keys(also a [`Row`]) used in Arranged is extract from collection's [`Row`] /// the key [`ScalarExpr`] indicate how the keys(also a [`Row`]) used in Arranged is extract from collection's [`Row`]
/// So it is the "index" of the arrangement /// So it is the "index" of the arrangement
/// ///
/// The `Arranged` is the actual data source, it can be used to read the data from the collection by /// The `Arranged` is the actual data source, it can be used to read the data from the collection by
/// using the key indicated by the `Vec<ScalarExpr>` /// using the key indicated by the `Vec<ScalarExpr>`
/// There is a false positive in using `Vec<ScalarExpr>` as key due to `ScalarExpr::Literal`
/// contain a `Value` which have `bytes` variant
#[allow(clippy::mutable_key_type)]
pub arranged: BTreeMap<Vec<ScalarExpr>, Arranged>, pub arranged: BTreeMap<Vec<ScalarExpr>, Arranged>,
} }
@@ -156,16 +151,12 @@ impl ErrCollector {
self.inner.borrow_mut().push_back(err) self.inner.borrow_mut().push_back(err)
} }
pub fn run<F, R>(&self, f: F) -> Option<R> pub fn run<F>(&self, f: F)
where where
F: FnOnce() -> Result<R, EvalError>, F: FnOnce() -> Result<(), EvalError>,
{ {
match f() { if let Err(e) = f() {
Ok(r) => Some(r), self.push_err(e)
Err(e) => {
self.push_err(e);
None
}
} }
} }
} }
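
The two versions of ErrCollector::run differ in whether the closure may hand back a value: one side only accepts Result<(), EvalError> and swallows the error, the other is generic over R and returns Option<R> so callers can keep the successful result. A self-contained sketch of the generic form, with EvalError replaced by a plain String and the Rc sharing omitted:

use std::cell::RefCell;
use std::collections::VecDeque;

#[derive(Default)]
struct ErrCollector {
    inner: RefCell<VecDeque<String>>,
}

impl ErrCollector {
    /// Run a fallible closure; stash the error and return None on failure.
    fn run<F, R>(&self, f: F) -> Option<R>
    where
        F: FnOnce() -> Result<R, String>,
    {
        match f() {
            Ok(r) => Some(r),
            Err(e) => {
                self.inner.borrow_mut().push_back(e);
                None
            }
        }
    }
}

fn main() {
    let errs = ErrCollector::default();
    assert_eq!(errs.run(|| Ok::<_, String>(21 * 2)), Some(42));
    assert_eq!(errs.run(|| Err::<i32, _>("boom".to_string())), None);
    assert_eq!(errs.inner.borrow().len(), 1);
}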

View File

@@ -52,13 +52,6 @@ pub enum EvalError {
location: Location, location: Location,
}, },
#[snafu(display("{msg}"))]
DataType {
msg: String,
source: datatypes::Error,
location: Location,
},
#[snafu(display("Invalid argument: {reason}"))] #[snafu(display("Invalid argument: {reason}"))]
InvalidArgument { reason: String, location: Location }, InvalidArgument { reason: String, location: Location },

View File

@@ -89,11 +89,6 @@ impl MapFilterProject {
} }
} }
/// The number of columns expected in the output row.
pub fn output_arity(&self) -> usize {
self.projection.len()
}
/// Given two mfps, return an mfp that applies one /// Given two mfps, return an mfp that applies one
/// followed by the other. /// followed by the other.
/// Note that the arguments are in the opposite order /// Note that the arguments are in the opposite order

View File

@@ -18,9 +18,7 @@
//! So the overhead is acceptable. //! So the overhead is acceptable.
//! //!
//! Currently supports sum, count, any, all and min/max (with one caveat: min/max can't support delete with aggregate). //! Currently supports sum, count, any, all and min/max (with one caveat: min/max can't support delete with aggregate).
//! TODO: think of better ways to avoid ser/de every time an accum needs to be updated, since it's in a tight loop
use std::any::type_name;
use std::fmt::Display; use std::fmt::Display;
use common_decimal::Decimal128; use common_decimal::Decimal128;
@@ -41,7 +39,6 @@ use crate::repr::Diff;
#[enum_dispatch] #[enum_dispatch]
pub trait Accumulator: Sized { pub trait Accumulator: Sized {
fn into_state(self) -> Vec<Value>; fn into_state(self) -> Vec<Value>;
fn update( fn update(
&mut self, &mut self,
aggr_fn: &AggregateFunc, aggr_fn: &AggregateFunc,
@@ -71,21 +68,6 @@ pub struct Bool {
falses: Diff, falses: Diff,
} }
impl Bool {
/// Expect two `Diff` type values, one for `true` and one for `false`.
pub fn try_from_iter<I>(iter: &mut I) -> Result<Self, EvalError>
where
I: Iterator<Item = Value>,
{
Ok(Self {
trues: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
falses: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
})
}
}
impl TryFrom<Vec<Value>> for Bool { impl TryFrom<Vec<Value>> for Bool {
type Error = EvalError; type Error = EvalError;
@@ -96,9 +78,13 @@ impl TryFrom<Vec<Value>> for Bool {
reason: "Bool Accumulator state should have 2 values", reason: "Bool Accumulator state should have 2 values",
} }
); );
let mut iter = state.into_iter(); let mut iter = state.into_iter();
Self::try_from_iter(&mut iter) Ok(Self {
trues: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
falses: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
} }
} }
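
The hunk above shows two ways to rebuild the boolean accumulator state: an iterator-based constructor and a `TryFrom<Vec<Value>>` with an up-front arity check. A small sketch of both, using a simplified `Value` and plain `String` errors instead of the crate's types:

use std::convert::TryFrom;

/// Simplified stand-in for the crate's `Value`; only an integer variant is needed here.
#[derive(Clone, Debug, PartialEq)]
enum Value {
    Int(i64),
}

type Diff = i64;

#[derive(Debug, PartialEq)]
struct Bool {
    trues: Diff,
    falses: Diff,
}

fn to_diff(v: Value) -> Result<Diff, String> {
    match v {
        Value::Int(i) => Ok(i),
    }
}

impl Bool {
    /// Iterator-based variant: fails if the iterator is exhausted early.
    fn try_from_iter<I>(iter: &mut I) -> Result<Self, String>
    where
        I: Iterator<Item = Value>,
    {
        let missing = || "state exhausted before Bool could be built".to_string();
        Ok(Self {
            trues: to_diff(iter.next().ok_or_else(missing)?)?,
            falses: to_diff(iter.next().ok_or_else(missing)?)?,
        })
    }
}

impl TryFrom<Vec<Value>> for Bool {
    type Error = String;

    /// Vec-based variant: checks the arity up front, then consumes the values.
    fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
        if state.len() != 2 {
            return Err("Bool accumulator state should have 2 values".to_string());
        }
        let mut iter = state.into_iter();
        Bool::try_from_iter(&mut iter)
    }
}

fn main() -> Result<(), String> {
    let state = vec![Value::Int(3), Value::Int(1)];
    let from_vec = Bool::try_from(state.clone())?;
    let from_iter = Bool::try_from_iter(&mut state.into_iter())?;
    assert_eq!(from_vec, from_iter);
    assert_eq!(from_vec, Bool { trues: 3, falses: 1 });
    Ok(())
}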
@@ -171,24 +157,6 @@ pub struct SimpleNumber {
non_nulls: Diff, non_nulls: Diff,
} }
impl SimpleNumber {
/// Expect one `Decimal128` and one `Diff` type values.
/// The `Decimal128` type is used to store the sum of all non-NULL values.
/// The `Diff` type is used to count the number of non-NULL values.
pub fn try_from_iter<I>(iter: &mut I) -> Result<Self, EvalError>
where
I: Iterator<Item = Value>,
{
Ok(Self {
accum: Decimal128::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?
.val(),
non_nulls: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
})
}
}
impl TryFrom<Vec<Value>> for SimpleNumber { impl TryFrom<Vec<Value>> for SimpleNumber {
type Error = EvalError; type Error = EvalError;
@@ -200,7 +168,13 @@ impl TryFrom<Vec<Value>> for SimpleNumber {
} }
); );
let mut iter = state.into_iter(); let mut iter = state.into_iter();
Self::try_from_iter(&mut iter)
Ok(Self {
accum: Decimal128::try_from(iter.next().unwrap())
.map_err(err_try_from_val)?
.val(),
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
} }
} }
@@ -298,34 +272,6 @@ pub struct Float {
non_nulls: Diff, non_nulls: Diff,
} }
impl Float {
/// Expect first value to be `OrderedF64` and the rest four values to be `Diff` type values.
pub fn try_from_iter<I>(iter: &mut I) -> Result<Self, EvalError>
where
I: Iterator<Item = Value>,
{
let mut ret = Self {
accum: OrderedF64::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
pos_infs: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
neg_infs: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
nans: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
non_nulls: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
};
// This prevents the counter-intuitive behavior where summing over no values yields a non-zero result
if ret.non_nulls == 0 {
ret.accum = OrderedFloat::from(0.0);
}
Ok(ret)
}
}
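
The `Float::try_from_iter` body above guards against a stored sum with zero contributing values. A tiny sketch of that guard with plain `f64` (ordered floats and the infinity/NaN counters are omitted):

/// Simplified float-sum accumulator: just the running sum and the non-NULL count.
#[derive(Debug, PartialEq)]
struct FloatAccum {
    accum: f64,
    non_nulls: i64,
}

impl FloatAccum {
    /// Restores the accumulator from a stored (sum, count) pair.
    fn from_state(accum: f64, non_nulls: i64) -> Self {
        let mut ret = Self { accum, non_nulls };
        // Guard against a non-zero stored sum with zero contributing values,
        // so that SUM over no rows never reports a non-zero result.
        if ret.non_nulls == 0 {
            ret.accum = 0.0;
        }
        ret
    }
}

fn main() {
    // A stale state (a sum without any contributing rows) is normalized back to 0.0.
    assert_eq!(FloatAccum::from_state(1.5, 0), FloatAccum { accum: 0.0, non_nulls: 0 });
    // A regular state is kept as-is.
    assert_eq!(FloatAccum::from_state(4.25, 3), FloatAccum { accum: 4.25, non_nulls: 3 });
}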
impl TryFrom<Vec<Value>> for Float { impl TryFrom<Vec<Value>> for Float {
type Error = EvalError; type Error = EvalError;
@@ -439,26 +385,6 @@ pub struct OrdValue {
non_nulls: Diff, non_nulls: Diff,
} }
impl OrdValue {
pub fn try_from_iter<I>(iter: &mut I) -> Result<Self, EvalError>
where
I: Iterator<Item = Value>,
{
Ok(Self {
val: {
let v = iter.next().ok_or_else(fail_accum::<Self>)?;
if v == Value::Null {
None
} else {
Some(v)
}
},
non_nulls: Diff::try_from(iter.next().ok_or_else(fail_accum::<Self>)?)
.map_err(err_try_from_val)?,
})
}
}
impl TryFrom<Vec<Value>> for OrdValue { impl TryFrom<Vec<Value>> for OrdValue {
type Error = EvalError; type Error = EvalError;
@@ -667,37 +593,6 @@ impl Accum {
}) })
} }
pub fn try_from_iter(
aggr_fn: &AggregateFunc,
iter: &mut impl Iterator<Item = Value>,
) -> Result<Self, EvalError> {
match aggr_fn {
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool => Ok(Self::from(Bool::try_from_iter(iter)?)),
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64 => Ok(Self::from(SimpleNumber::try_from_iter(iter)?)),
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => {
Ok(Self::from(Float::try_from_iter(iter)?))
}
f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
Ok(Self::from(OrdValue::try_from_iter(iter)?))
}
f => Err(InternalSnafu {
reason: format!(
"Accumulator does not support this aggregation function: {:?}",
f
),
}
.build()),
}
}
/// try to convert a vector of value into given aggregate function's accumulator /// try to convert a vector of value into given aggregate function's accumulator
pub fn try_into_accum(aggr_fn: &AggregateFunc, state: Vec<Value>) -> Result<Self, EvalError> { pub fn try_into_accum(aggr_fn: &AggregateFunc, state: Vec<Value>) -> Result<Self, EvalError> {
match aggr_fn { match aggr_fn {
@@ -728,16 +623,6 @@ impl Accum {
} }
} }
fn fail_accum<T>() -> EvalError {
InternalSnafu {
reason: format!(
"list of values exhausted before a accum of type {} can be build from it",
type_name::<T>()
),
}
.build()
}
fn err_try_from_val<T: Display>(reason: T) -> EvalError { fn err_try_from_val<T: Display>(reason: T) -> EvalError {
TryFromValueSnafu { TryFromValueSnafu {
msg: reason.to_string(), msg: reason.to_string(),
@@ -890,9 +775,7 @@ mod test {
let mut acc = Accum::new_accum(&aggr_fn)?; let mut acc = Accum::new_accum(&aggr_fn)?;
acc.update_batch(&aggr_fn, input.clone())?; acc.update_batch(&aggr_fn, input.clone())?;
let row = acc.into_state(); let row = acc.into_state();
let acc = Accum::try_into_accum(&aggr_fn, row.clone())?; let acc = Accum::try_into_accum(&aggr_fn, row)?;
let alter_acc = Accum::try_from_iter(&aggr_fn, &mut row.into_iter())?;
assert_eq!(acc, alter_acc);
Ok(acc) Ok(acc)
}; };
let acc = match create_and_insert() { let acc = match create_and_insert() {
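
The test above round-trips an accumulator through its serialized state. The same property on a toy count accumulator, as a self-contained sketch (none of the crate's types are used):

/// A toy accumulator whose state is a single-value vector, mirroring the
/// `into_state` / `try_into_accum` round trip exercised in the test above.
#[derive(Debug, PartialEq)]
struct CountAccum {
    count: i64,
}

impl CountAccum {
    fn new() -> Self {
        Self { count: 0 }
    }

    fn update_batch(&mut self, values: &[i64]) {
        self.count += values.len() as i64;
    }

    fn into_state(self) -> Vec<i64> {
        vec![self.count]
    }

    fn try_from_state(state: Vec<i64>) -> Result<Self, String> {
        match state.as_slice() {
            [count] => Ok(Self { count: *count }),
            _ => Err("Count accumulator state should have exactly 1 value".to_string()),
        }
    }
}

fn main() -> Result<(), String> {
    let mut acc = CountAccum::new();
    acc.update_batch(&[1, 2, 3]);
    let state = acc.into_state();
    // Rebuilding from the serialized state must yield an equivalent accumulator.
    let restored = CountAccum::try_from_state(state)?;
    assert_eq!(restored, CountAccum { count: 3 });
    Ok(())
}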

View File

@@ -112,21 +112,18 @@ impl AggregateFunc {
/// Expect self to be accumulable aggregate function, i.e. sum/count /// Expect self to be accumulable aggregate function, i.e. sum/count
/// ///
/// TODO(discord9): deal with overflow&better accumulator /// TODO(discord9): deal with overflow&better accumulator
pub fn eval_diff_accumulable<A, I>( pub fn eval_diff_accumulable<I>(
&self, &self,
accum: A, accum: Vec<Value>,
value_diffs: I, value_diffs: I,
) -> Result<(Value, Vec<Value>), EvalError> ) -> Result<(Value, Vec<Value>), EvalError>
where where
A: IntoIterator<Item = Value>,
I: IntoIterator<Item = (Value, Diff)>, I: IntoIterator<Item = (Value, Diff)>,
{ {
let mut accum = accum.into_iter().peekable(); let mut accum = if accum.is_empty() {
let mut accum = if accum.peek().is_none() {
Accum::new_accum(self)? Accum::new_accum(self)?
} else { } else {
Accum::try_from_iter(self, &mut accum)? Accum::try_into_accum(self, accum)?
}; };
accum.update_batch(self, value_diffs)?; accum.update_batch(self, value_diffs)?;
let res = accum.eval(self)?; let res = accum.eval(self)?;
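
A sketch of the overall shape of `eval_diff_accumulable`: reuse the stored state if present, otherwise start a fresh accumulator, then fold in the `(value, diff)` pairs. It is specialized here to a toy count accumulator with plain integers instead of the crate's `Value`/`Accum`:

/// Stand-in diff type: +1 for insert, -1 for delete.
type Diff = i64;

/// Toy count accumulator; its serialized state is a single i64.
struct Count(i64);

impl Count {
    fn new() -> Self {
        Count(0)
    }

    fn from_state(state: Vec<i64>) -> Result<Self, String> {
        match state.as_slice() {
            [c] => Ok(Count(*c)),
            _ => Err("count state should have exactly 1 value".to_string()),
        }
    }

    fn update_batch<I: IntoIterator<Item = (i64, Diff)>>(&mut self, value_diffs: I) {
        for (_value, diff) in value_diffs {
            self.0 += diff;
        }
    }
}

/// Mirrors the shape of `eval_diff_accumulable`: an empty state means "start a
/// fresh accumulator", otherwise the accumulator is rebuilt from the state.
fn eval_diff_count<I>(state: Vec<i64>, value_diffs: I) -> Result<(i64, Vec<i64>), String>
where
    I: IntoIterator<Item = (i64, Diff)>,
{
    let mut accum = if state.is_empty() {
        Count::new()
    } else {
        Count::from_state(state)?
    };
    accum.update_batch(value_diffs);
    let result = accum.0;
    Ok((result, vec![accum.0]))
}

fn main() -> Result<(), String> {
    // First call: no previous state, two inserts.
    let (res, state) = eval_diff_count(vec![], vec![(10, 1), (20, 1)])?;
    assert_eq!(res, 2);
    // Second call: previous state plus one delete.
    let (res, _state) = eval_diff_count(state, vec![(10, -1)])?;
    assert_eq!(res, 1);
    Ok(())
}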

View File

@@ -16,7 +16,6 @@
//! It can transform a substrait plan into its own plan and execute it. //! It can transform a substrait plan into its own plan and execute it.
//! It also contains definition of expression, adapter and plan, and internal state management. //! It also contains definition of expression, adapter and plan, and internal state management.
#![feature(let_chains)]
#![allow(dead_code)] #![allow(dead_code)]
#![allow(unused_imports)] #![allow(unused_imports)]
#![warn(missing_docs)] #![warn(missing_docs)]

View File

@@ -21,12 +21,12 @@ mod reduce;
use datatypes::arrow::ipc::Map; use datatypes::arrow::ipc::Map;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub(crate) use self::reduce::{AccumulablePlan, KeyValPlan, ReducePlan};
use crate::adapter::error::Error; use crate::adapter::error::Error;
use crate::expr::{ use crate::expr::{
AggregateExpr, EvalError, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr, TypedExpr, AggregateExpr, EvalError, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr, TypedExpr,
}; };
use crate::plan::join::JoinPlan; use crate::plan::join::JoinPlan;
pub(crate) use crate::plan::reduce::{AccumulablePlan, AggrWithIndex, KeyValPlan, ReducePlan};
use crate::repr::{ColumnType, DiffRow, RelationType}; use crate::repr::{ColumnType, DiffRow, RelationType};
/// A plan for a dataflow component. But with type to indicate the output type of the relation. /// A plan for a dataflow component. But with type to indicate the output type of the relation.

View File

@@ -47,33 +47,7 @@ pub struct AccumulablePlan {
/// Each element represents: /// Each element represents:
/// (index of aggr output, index of value among inputs, aggr expr) /// (index of aggr output, index of value among inputs, aggr expr)
/// These will all be rendered together in one dataflow fragment. /// These will all be rendered together in one dataflow fragment.
/// pub simple_aggrs: Vec<(usize, usize, AggregateExpr)>,
/// Invariant: the output index is the index of the aggregation in `full_aggrs` /// Same as above but for all of the `DISTINCT` accumulable aggregations.
/// which means output index is always smaller than the length of `full_aggrs` pub distinct_aggrs: Vec<(usize, usize, AggregateExpr)>,
pub simple_aggrs: Vec<AggrWithIndex>,
/// Same as `simple_aggrs` but for all of the `DISTINCT` accumulable aggregations.
pub distinct_aggrs: Vec<AggrWithIndex>,
}
/// Invariant: the output index is the index of the aggregation in `full_aggrs`
/// which means output index is always smaller than the length of `full_aggrs`
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct AggrWithIndex {
/// aggregation expression
pub expr: AggregateExpr,
/// index of aggr input among input row
pub input_idx: usize,
/// index of aggr output among output row
pub output_idx: usize,
}
impl AggrWithIndex {
/// Create a new `AggrWithIndex`
pub fn new(expr: AggregateExpr, input_idx: usize, output_idx: usize) -> Self {
Self {
expr,
input_idx,
output_idx,
}
}
} }
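
The two sides of this hunk carry the same three pieces of information, either as a bare `(output, input, expr)` tuple or as the named `AggrWithIndex` struct. A small sketch with a placeholder `AggregateExpr` showing both forms side by side:

/// Placeholder for the real `AggregateExpr`; only a name is needed here.
#[derive(Clone, Debug, PartialEq)]
struct AggregateExpr {
    name: String,
}

/// Named form: field names document which index is which.
#[derive(Clone, Debug, PartialEq)]
struct AggrWithIndex {
    expr: AggregateExpr,
    /// Index of the aggregation's input among the input row.
    input_idx: usize,
    /// Index of the aggregation's output among the output row.
    output_idx: usize,
}

impl AggrWithIndex {
    fn new(expr: AggregateExpr, input_idx: usize, output_idx: usize) -> Self {
        Self { expr, input_idx, output_idx }
    }
}

fn main() {
    let sum = AggregateExpr { name: "sum".to_string() };

    // Tuple form: the reader has to remember the (output, input, expr) ordering.
    let tuple_form: Vec<(usize, usize, AggregateExpr)> = vec![(0, 1, sum.clone())];

    // Named form: the same information, self-describing at the use site.
    let named_form = vec![AggrWithIndex::new(sum.clone(), 1, 0)];

    assert_eq!(tuple_form[0].2, named_form[0].expr);
    assert_eq!(tuple_form[0].0, named_form[0].output_idx);
    assert_eq!(tuple_form[0].1, named_form[0].input_idx);
}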

View File

@@ -160,7 +160,7 @@ impl Row {
self.inner.iter() self.inner.iter()
} }
/// Returns the number of elements in the row, also known as its 'length'. /// eturns the number of elements in the row, also known as its 'length'.
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
self.inner.len() self.inner.len()
} }

View File

@@ -52,7 +52,7 @@ use crate::expr::{
AggregateExpr, AggregateFunc, BinaryFunc, GlobalId, MapFilterProject, SafeMfpPlan, ScalarExpr, AggregateExpr, AggregateFunc, BinaryFunc, GlobalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
TypedExpr, UnaryFunc, UnmaterializableFunc, VariadicFunc, TypedExpr, UnaryFunc, UnmaterializableFunc, VariadicFunc,
}; };
use crate::plan::{AccumulablePlan, AggrWithIndex, KeyValPlan, Plan, ReducePlan, TypedPlan}; use crate::plan::{AccumulablePlan, KeyValPlan, Plan, ReducePlan, TypedPlan};
use crate::repr::{self, ColumnType, RelationType}; use crate::repr::{self, ColumnType, RelationType};
use crate::transform::{DataflowContext, FunctionExtensions}; use crate::transform::{DataflowContext, FunctionExtensions};
@@ -265,17 +265,9 @@ impl TypedPlan {
reason: "Expect aggregate argument to be transformed into a column at this point", reason: "Expect aggregate argument to be transformed into a column at this point",
})?; })?;
if aggr_expr.distinct { if aggr_expr.distinct {
distinct_aggrs.push(AggrWithIndex::new( distinct_aggrs.push((output_column, input_column, aggr_expr.clone()));
aggr_expr.clone(),
input_column,
output_column,
));
} else { } else {
simple_aggrs.push(AggrWithIndex::new( simple_aggrs.push((output_column, input_column, aggr_expr.clone()));
aggr_expr.clone(),
input_column,
output_column,
));
} }
} }
let accum_plan = AccumulablePlan { let accum_plan = AccumulablePlan {
@@ -335,7 +327,7 @@ mod test {
}, },
reduce_plan: ReducePlan::Accumulable(AccumulablePlan { reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
full_aggrs: vec![aggr_expr.clone()], full_aggrs: vec![aggr_expr.clone()],
simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], simple_aggrs: vec![(0, 0, aggr_expr.clone())],
distinct_aggrs: vec![], distinct_aggrs: vec![],
}), }),
}), }),
@@ -387,7 +379,7 @@ mod test {
}, },
reduce_plan: ReducePlan::Accumulable(AccumulablePlan { reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
full_aggrs: vec![aggr_expr.clone()], full_aggrs: vec![aggr_expr.clone()],
simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], simple_aggrs: vec![(0, 0, aggr_expr.clone())],
distinct_aggrs: vec![], distinct_aggrs: vec![],
}), }),
}), }),
@@ -438,7 +430,7 @@ mod test {
}, },
reduce_plan: ReducePlan::Accumulable(AccumulablePlan { reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
full_aggrs: vec![aggr_expr.clone()], full_aggrs: vec![aggr_expr.clone()],
simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], simple_aggrs: vec![(0, 0, aggr_expr.clone())],
distinct_aggrs: vec![], distinct_aggrs: vec![],
}), }),
}), }),

File diff suppressed because it is too large

View File

@@ -650,7 +650,7 @@ mod tests {
ts TIMESTAMP, ts TIMESTAMP,
TIME INDEX (ts), TIME INDEX (ts),
PRIMARY KEY(host) PRIMARY KEY(host)
) engine=mito;"#; ) engine=mito with(regions=1);"#;
replace_test(sql, plugins.clone(), &query_ctx); replace_test(sql, plugins.clone(), &query_ctx);
// test drop table // test drop table

View File

@@ -102,10 +102,15 @@ impl LeaderCachedKvBackend {
self.store.clone(), self.store.clone(),
RangeRequest::new().with_prefix(prefix.as_bytes()), RangeRequest::new().with_prefix(prefix.as_bytes()),
DEFAULT_PAGE_SIZE, DEFAULT_PAGE_SIZE,
Arc::new(Ok), Arc::new(|kv| Ok((kv, ()))),
); );
let kvs = stream.try_collect::<Vec<_>>().await?.into_iter().collect(); let kvs = stream
.try_collect::<Vec<_>>()
.await?
.into_iter()
.map(|(kv, _)| kv)
.collect();
self.cache self.cache
.batch_put(BatchPutRequest { .batch_put(BatchPutRequest {

View File

@@ -25,7 +25,7 @@ use store_api::region_request::{
AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionPutRequest, RegionRequest, AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionPutRequest, RegionRequest,
}; };
use store_api::storage::consts::ReservedColumnId; use store_api::storage::consts::ReservedColumnId;
use store_api::storage::{ConcreteDataType, RegionId}; use store_api::storage::RegionId;
use crate::error::{ use crate::error::{
ColumnTypeMismatchSnafu, MitoReadOperationSnafu, MitoWriteOperationSnafu, Result, ColumnTypeMismatchSnafu, MitoReadOperationSnafu, MitoWriteOperationSnafu, Result,
@@ -128,8 +128,7 @@ impl DataRegion {
if c.semantic_type == SemanticType::Tag { if c.semantic_type == SemanticType::Tag {
if !c.column_schema.data_type.is_string() { if !c.column_schema.data_type.is_string() {
return ColumnTypeMismatchSnafu { return ColumnTypeMismatchSnafu {
expect: ConcreteDataType::string_datatype(), column_type: c.column_schema.data_type.clone(),
actual: c.column_schema.data_type.clone(),
} }
.fail(); .fail();
} }

View File

@@ -43,11 +43,9 @@ use crate::engine::options::{
}; };
use crate::engine::MetricEngineInner; use crate::engine::MetricEngineInner;
use crate::error::{ use crate::error::{
AddingFieldColumnSnafu, ColumnNotFoundSnafu, ColumnTypeMismatchSnafu, ColumnNotFoundSnafu, ConflictRegionOptionSnafu, CreateMitoRegionSnafu,
ConflictRegionOptionSnafu, CreateMitoRegionSnafu, InternalColumnOccupiedSnafu, InternalColumnOccupiedSnafu, MissingRegionOptionSnafu, MitoReadOperationSnafu,
InvalidMetadataSnafu, MissingRegionOptionSnafu, MitoReadOperationSnafu, ParseRegionIdSnafu, PhysicalRegionNotFoundSnafu, Result, SerializeColumnMetadataSnafu,
MultipleFieldColumnSnafu, NoFieldColumnSnafu, ParseRegionIdSnafu, PhysicalRegionNotFoundSnafu,
Result, SerializeColumnMetadataSnafu,
}; };
use crate::metrics::{LOGICAL_REGION_COUNT, PHYSICAL_COLUMN_COUNT, PHYSICAL_REGION_COUNT}; use crate::metrics::{LOGICAL_REGION_COUNT, PHYSICAL_COLUMN_COUNT, PHYSICAL_REGION_COUNT};
use crate::utils::{to_data_region_id, to_metadata_region_id}; use crate::utils::{to_data_region_id, to_metadata_region_id};
@@ -193,14 +191,6 @@ impl MetricEngineInner {
})?; })?;
for col in &request.column_metadatas { for col in &request.column_metadatas {
if !physical_columns.contains(&col.column_schema.name) { if !physical_columns.contains(&col.column_schema.name) {
// Multi-field on the physical table is explicitly forbidden at present
// TODO(ruihang): support multi-field on both logical and physical column
ensure!(
col.semantic_type != SemanticType::Field,
AddingFieldColumnSnafu {
name: col.column_schema.name.clone()
}
);
new_columns.push(col.clone()); new_columns.push(col.clone());
} else { } else {
existing_columns.push(col.column_schema.name.clone()); existing_columns.push(col.column_schema.name.clone());
@@ -300,8 +290,6 @@ impl MetricEngineInner {
/// - required table option is present ([PHYSICAL_TABLE_METADATA_KEY] or /// - required table option is present ([PHYSICAL_TABLE_METADATA_KEY] or
/// [LOGICAL_TABLE_METADATA_KEY]) /// [LOGICAL_TABLE_METADATA_KEY])
fn verify_region_create_request(request: &RegionCreateRequest) -> Result<()> { fn verify_region_create_request(request: &RegionCreateRequest) -> Result<()> {
request.validate().context(InvalidMetadataSnafu)?;
let name_to_index = request let name_to_index = request
.column_metadatas .column_metadatas
.iter() .iter()
@@ -335,41 +323,6 @@ impl MetricEngineInner {
ConflictRegionOptionSnafu {} ConflictRegionOptionSnafu {}
); );
// check if only one field column is declared, and all tag columns are string
let mut field_col: Option<&ColumnMetadata> = None;
for col in &request.column_metadatas {
match col.semantic_type {
SemanticType::Tag => ensure!(
col.column_schema.data_type == ConcreteDataType::string_datatype(),
ColumnTypeMismatchSnafu {
expect: ConcreteDataType::string_datatype(),
actual: col.column_schema.data_type.clone(),
}
),
SemanticType::Field => {
if field_col.is_some() {
MultipleFieldColumnSnafu {
previous: field_col.unwrap().column_schema.name.clone(),
current: col.column_schema.name.clone(),
}
.fail()?;
}
field_col = Some(col)
}
SemanticType::Timestamp => {}
}
}
let field_col = field_col.context(NoFieldColumnSnafu)?;
// make sure the field column is float64 type
ensure!(
field_col.column_schema.data_type == ConcreteDataType::float64_datatype(),
ColumnTypeMismatchSnafu {
expect: ConcreteDataType::float64_datatype(),
actual: field_col.column_schema.data_type.clone(),
}
);
Ok(()) Ok(())
} }
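
One side of this hunk validates the metric engine's column layout: every tag must be a string and exactly one float64 field must exist. A standalone sketch of that check with simplified column metadata types (not the real `ColumnMetadata`/`ConcreteDataType`):

#[derive(Clone, Debug, PartialEq)]
enum DataType {
    String,
    Float64,
    TimestampMillisecond,
}

#[derive(Clone, Debug, PartialEq)]
enum SemanticType {
    Tag,
    Field,
    Timestamp,
}

#[derive(Clone, Debug)]
struct Column {
    name: String,
    semantic_type: SemanticType,
    data_type: DataType,
}

/// Checks that all tags are strings and exactly one float64 field exists.
fn verify_columns(columns: &[Column]) -> Result<(), String> {
    let mut field_col: Option<&Column> = None;
    for col in columns {
        match col.semantic_type {
            SemanticType::Tag => {
                if col.data_type != DataType::String {
                    return Err(format!("tag column {} must be a string", col.name));
                }
            }
            SemanticType::Field => {
                if let Some(previous) = field_col {
                    return Err(format!(
                        "multiple field columns found: {} and {}",
                        previous.name, col.name
                    ));
                }
                field_col = Some(col);
            }
            SemanticType::Timestamp => {}
        }
    }
    let field_col = field_col.ok_or_else(|| "no field column found".to_string())?;
    if field_col.data_type != DataType::Float64 {
        return Err(format!("field column {} must be float64", field_col.name));
    }
    Ok(())
}

fn main() {
    let ok = vec![
        Column { name: "ts".into(), semantic_type: SemanticType::Timestamp, data_type: DataType::TimestampMillisecond },
        Column { name: "host".into(), semantic_type: SemanticType::Tag, data_type: DataType::String },
        Column { name: "val".into(), semantic_type: SemanticType::Field, data_type: DataType::Float64 },
    ];
    assert!(verify_columns(&ok).is_ok());

    // A second field column is rejected.
    let extra_field = Column { name: "val2".into(), semantic_type: SemanticType::Field, data_type: DataType::Float64 };
    let two_fields = [ok.clone(), vec![extra_field]].concat();
    assert!(verify_columns(&two_fields).is_err());
}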
@@ -578,15 +531,6 @@ mod test {
false, false,
), ),
}, },
ColumnMetadata {
column_id: 2,
semantic_type: SemanticType::Field,
column_schema: ColumnSchema::new(
"column2".to_string(),
ConcreteDataType::float64_datatype(),
false,
),
},
], ],
region_dir: "test_dir".to_string(), region_dir: "test_dir".to_string(),
engine: METRIC_ENGINE_NAME.to_string(), engine: METRIC_ENGINE_NAME.to_string(),
@@ -595,51 +539,37 @@ mod test {
.into_iter() .into_iter()
.collect(), .collect(),
}; };
MetricEngineInner::verify_region_create_request(&request).unwrap(); let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_ok());
} }
#[test] #[test]
fn test_verify_region_create_request_options() { fn test_verify_region_create_request_options() {
let mut request = RegionCreateRequest { let mut request = RegionCreateRequest {
column_metadatas: vec![ column_metadatas: vec![],
ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Timestamp,
column_schema: ColumnSchema::new(
METADATA_SCHEMA_TIMESTAMP_COLUMN_NAME,
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
},
ColumnMetadata {
column_id: 1,
semantic_type: SemanticType::Field,
column_schema: ColumnSchema::new(
"val".to_string(),
ConcreteDataType::float64_datatype(),
false,
),
},
],
region_dir: "test_dir".to_string(), region_dir: "test_dir".to_string(),
engine: METRIC_ENGINE_NAME.to_string(), engine: METRIC_ENGINE_NAME.to_string(),
primary_key: vec![], primary_key: vec![],
options: HashMap::new(), options: HashMap::new(),
}; };
MetricEngineInner::verify_region_create_request(&request).unwrap_err(); let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_err());
let mut options = HashMap::new(); let mut options = HashMap::new();
options.insert(PHYSICAL_TABLE_METADATA_KEY.to_string(), "value".to_string()); options.insert(PHYSICAL_TABLE_METADATA_KEY.to_string(), "value".to_string());
request.options.clone_from(&options); request.options.clone_from(&options);
MetricEngineInner::verify_region_create_request(&request).unwrap(); let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_ok());
options.insert(LOGICAL_TABLE_METADATA_KEY.to_string(), "value".to_string()); options.insert(LOGICAL_TABLE_METADATA_KEY.to_string(), "value".to_string());
request.options.clone_from(&options); request.options.clone_from(&options);
MetricEngineInner::verify_region_create_request(&request).unwrap_err(); let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_err());
options.remove(PHYSICAL_TABLE_METADATA_KEY).unwrap(); options.remove(PHYSICAL_TABLE_METADATA_KEY).unwrap();
request.options = options; request.options = options;
MetricEngineInner::verify_region_create_request(&request).unwrap(); let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_ok());
} }
#[tokio::test] #[tokio::test]

View File

@@ -133,10 +133,9 @@ pub enum Error {
location: Location, location: Location,
}, },
#[snafu(display("Column type mismatch. Expect {:?}, got {:?}", expect, actual))] #[snafu(display("Column type mismatch. Expect string, got {:?}", column_type))]
ColumnTypeMismatch { ColumnTypeMismatch {
expect: ConcreteDataType, column_type: ConcreteDataType,
actual: ConcreteDataType,
location: Location, location: Location,
}, },
@@ -170,19 +169,6 @@ pub enum Error {
request: RegionRequest, request: RegionRequest,
location: Location, location: Location,
}, },
#[snafu(display("Multiple field column found: {} and {}", previous, current))]
MultipleFieldColumn {
previous: String,
current: String,
location: Location,
},
#[snafu(display("Adding field column {} to physical table", name))]
AddingFieldColumn { name: String, location: Location },
#[snafu(display("No field column found"))]
NoFieldColumn { location: Location },
} }
pub type Result<T, E = Error> = std::result::Result<T, E>; pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -196,10 +182,7 @@ impl ErrorExt for Error {
| MissingRegionOption { .. } | MissingRegionOption { .. }
| ConflictRegionOption { .. } | ConflictRegionOption { .. }
| ColumnTypeMismatch { .. } | ColumnTypeMismatch { .. }
| PhysicalRegionBusy { .. } | PhysicalRegionBusy { .. } => StatusCode::InvalidArguments,
| MultipleFieldColumn { .. }
| NoFieldColumn { .. }
| AddingFieldColumn { .. } => StatusCode::InvalidArguments,
ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => { ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => {
StatusCode::Unsupported StatusCode::Unsupported

View File

@@ -210,9 +210,9 @@ pub fn create_logical_region_request(
), ),
}, },
]; ];
for (bias, tag) in tags.iter().enumerate() { for tag in tags {
column_metadatas.push(ColumnMetadata { column_metadatas.push(ColumnMetadata {
column_id: 2 + bias as ColumnId, column_id: 2,
semantic_type: SemanticType::Tag, semantic_type: SemanticType::Tag,
column_schema: ColumnSchema::new( column_schema: ColumnSchema::new(
tag.to_string(), tag.to_string(),
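
The change above replaces the fixed `column_id: 2` with `2 + bias`, so each tag column receives a distinct id. A tiny sketch of the enumerate-based assignment with a simplified tag column struct:

type ColumnId = u32;

#[derive(Debug, PartialEq)]
struct TagColumn {
    column_id: ColumnId,
    name: String,
}

/// Assigns ids 2, 3, 4, ... to tag columns, assuming ids 0 and 1 are already
/// taken by the timestamp and field columns, as in the hunk above.
fn build_tag_columns(tags: &[&str]) -> Vec<TagColumn> {
    tags.iter()
        .enumerate()
        .map(|(bias, tag)| TagColumn {
            column_id: 2 + bias as ColumnId,
            name: tag.to_string(),
        })
        .collect()
}

fn main() {
    let cols = build_tag_columns(&["host", "idc", "pod"]);
    let ids: Vec<ColumnId> = cols.iter().map(|c| c.column_id).collect();
    // Every tag gets a distinct id instead of all sharing id 2.
    assert_eq!(ids, vec![2, 3, 4]);
}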

View File

@@ -32,7 +32,6 @@ common-test-util = { workspace = true, optional = true }
common-time.workspace = true common-time.workspace = true
common-wal.workspace = true common-wal.workspace = true
crc32fast = "1" crc32fast = "1"
crossbeam-utils.workspace = true
datafusion.workspace = true datafusion.workspace = true
datafusion-common.workspace = true datafusion-common.workspace = true
datafusion-expr.workspace = true datafusion-expr.workspace = true

View File

@@ -37,11 +37,9 @@ use crate::error::{
use crate::metrics::COMPACTION_STAGE_ELAPSED; use crate::metrics::COMPACTION_STAGE_ELAPSED;
use crate::region::options::CompactionOptions; use crate::region::options::CompactionOptions;
use crate::region::version::{VersionControlRef, VersionRef}; use crate::region::version::{VersionControlRef, VersionRef};
use crate::region::ManifestContextRef;
use crate::request::{OptionOutputTx, OutputTx, WorkerRequest}; use crate::request::{OptionOutputTx, OutputTx, WorkerRequest};
use crate::schedule::scheduler::SchedulerRef; use crate::schedule::scheduler::SchedulerRef;
use crate::sst::file_purger::FilePurgerRef; use crate::sst::file_purger::FilePurgerRef;
use crate::worker::WorkerListener;
/// Region compaction request. /// Region compaction request.
pub struct CompactionRequest { pub struct CompactionRequest {
@@ -56,9 +54,6 @@ pub struct CompactionRequest {
/// Start time of compaction task. /// Start time of compaction task.
pub(crate) start_time: Instant, pub(crate) start_time: Instant,
pub(crate) cache_manager: CacheManagerRef, pub(crate) cache_manager: CacheManagerRef,
pub(crate) manifest_ctx: ManifestContextRef,
pub(crate) version_control: VersionControlRef,
pub(crate) listener: WorkerListener,
} }
impl CompactionRequest { impl CompactionRequest {
@@ -93,8 +88,6 @@ pub(crate) struct CompactionScheduler {
/// Request sender of the worker that this scheduler belongs to. /// Request sender of the worker that this scheduler belongs to.
request_sender: Sender<WorkerRequest>, request_sender: Sender<WorkerRequest>,
cache_manager: CacheManagerRef, cache_manager: CacheManagerRef,
engine_config: Arc<MitoConfig>,
listener: WorkerListener,
} }
impl CompactionScheduler { impl CompactionScheduler {
@@ -102,16 +95,12 @@ impl CompactionScheduler {
scheduler: SchedulerRef, scheduler: SchedulerRef,
request_sender: Sender<WorkerRequest>, request_sender: Sender<WorkerRequest>,
cache_manager: CacheManagerRef, cache_manager: CacheManagerRef,
engine_config: Arc<MitoConfig>,
listener: WorkerListener,
) -> Self { ) -> Self {
Self { Self {
scheduler, scheduler,
region_status: HashMap::new(), region_status: HashMap::new(),
request_sender, request_sender,
cache_manager, cache_manager,
engine_config,
listener,
} }
} }
@@ -123,7 +112,7 @@ impl CompactionScheduler {
access_layer: &AccessLayerRef, access_layer: &AccessLayerRef,
file_purger: &FilePurgerRef, file_purger: &FilePurgerRef,
waiter: OptionOutputTx, waiter: OptionOutputTx,
manifest_ctx: &ManifestContextRef, engine_config: Arc<MitoConfig>,
) -> Result<()> { ) -> Result<()> {
if let Some(status) = self.region_status.get_mut(&region_id) { if let Some(status) = self.region_status.get_mut(&region_id) {
// Region is compacting. Add the waiter to pending list. // Region is compacting. Add the waiter to pending list.
@@ -141,10 +130,8 @@ impl CompactionScheduler {
let request = status.new_compaction_request( let request = status.new_compaction_request(
self.request_sender.clone(), self.request_sender.clone(),
waiter, waiter,
self.engine_config.clone(), engine_config,
self.cache_manager.clone(), self.cache_manager.clone(),
manifest_ctx,
self.listener.clone(),
); );
self.region_status.insert(region_id, status); self.region_status.insert(region_id, status);
self.schedule_compaction_request(request) self.schedule_compaction_request(request)
@@ -154,7 +141,7 @@ impl CompactionScheduler {
pub(crate) fn on_compaction_finished( pub(crate) fn on_compaction_finished(
&mut self, &mut self,
region_id: RegionId, region_id: RegionId,
manifest_ctx: &ManifestContextRef, engine_config: Arc<MitoConfig>,
) { ) {
let Some(status) = self.region_status.get_mut(&region_id) else { let Some(status) = self.region_status.get_mut(&region_id) else {
return; return;
@@ -163,10 +150,8 @@ impl CompactionScheduler {
let request = status.new_compaction_request( let request = status.new_compaction_request(
self.request_sender.clone(), self.request_sender.clone(),
OptionOutputTx::none(), OptionOutputTx::none(),
self.engine_config.clone(), engine_config,
self.cache_manager.clone(), self.cache_manager.clone(),
manifest_ctx,
self.listener.clone(),
); );
// Try to schedule next compaction task for this region. // Try to schedule next compaction task for this region.
if let Err(e) = self.schedule_compaction_request(request) { if let Err(e) = self.schedule_compaction_request(request) {
@@ -340,8 +325,6 @@ impl CompactionStatus {
waiter: OptionOutputTx, waiter: OptionOutputTx,
engine_config: Arc<MitoConfig>, engine_config: Arc<MitoConfig>,
cache_manager: CacheManagerRef, cache_manager: CacheManagerRef,
manifest_ctx: &ManifestContextRef,
listener: WorkerListener,
) -> CompactionRequest { ) -> CompactionRequest {
let current_version = self.version_control.current().version; let current_version = self.version_control.current().version;
let start_time = Instant::now(); let start_time = Instant::now();
@@ -354,9 +337,6 @@ impl CompactionStatus {
file_purger: self.file_purger.clone(), file_purger: self.file_purger.clone(),
start_time, start_time,
cache_manager, cache_manager,
manifest_ctx: manifest_ctx.clone(),
version_control: self.version_control.clone(),
listener,
}; };
if let Some(pending) = self.pending_compaction.take() { if let Some(pending) = self.pending_compaction.take() {
@@ -391,9 +371,6 @@ mod tests {
let version_control = Arc::new(builder.build()); let version_control = Arc::new(builder.build());
let (output_tx, output_rx) = oneshot::channel(); let (output_tx, output_rx) = oneshot::channel();
let waiter = OptionOutputTx::from(output_tx); let waiter = OptionOutputTx::from(output_tx);
let manifest_ctx = env
.mock_manifest_context(version_control.current().version.metadata.clone())
.await;
scheduler scheduler
.schedule_compaction( .schedule_compaction(
builder.region_id(), builder.region_id(),
@@ -401,7 +378,7 @@ mod tests {
&env.access_layer, &env.access_layer,
&purger, &purger,
waiter, waiter,
&manifest_ctx, Arc::new(MitoConfig::default()),
) )
.unwrap(); .unwrap();
let output = output_rx.await.unwrap().unwrap(); let output = output_rx.await.unwrap().unwrap();
@@ -419,7 +396,7 @@ mod tests {
&env.access_layer, &env.access_layer,
&purger, &purger,
waiter, waiter,
&manifest_ctx, Arc::new(MitoConfig::default()),
) )
.unwrap(); .unwrap();
let output = output_rx.await.unwrap().unwrap(); let output = output_rx.await.unwrap().unwrap();
@@ -471,9 +448,6 @@ mod tests {
.push_l0_file(90, end) .push_l0_file(90, end)
.build(), .build(),
); );
let manifest_ctx = env
.mock_manifest_context(version_control.current().version.metadata.clone())
.await;
scheduler scheduler
.schedule_compaction( .schedule_compaction(
region_id, region_id,
@@ -481,7 +455,7 @@ mod tests {
&env.access_layer, &env.access_layer,
&purger, &purger,
OptionOutputTx::none(), OptionOutputTx::none(),
&manifest_ctx, Arc::new(MitoConfig::default()),
) )
.unwrap(); .unwrap();
// Should schedule 1 compaction. // Should schedule 1 compaction.
@@ -509,7 +483,7 @@ mod tests {
&env.access_layer, &env.access_layer,
&purger, &purger,
OptionOutputTx::none(), OptionOutputTx::none(),
&manifest_ctx, Arc::new(MitoConfig::default()),
) )
.unwrap(); .unwrap();
assert_eq!(1, scheduler.region_status.len()); assert_eq!(1, scheduler.region_status.len());
@@ -522,7 +496,7 @@ mod tests {
.is_some()); .is_some());
// On compaction finished and schedule next compaction. // On compaction finished and schedule next compaction.
scheduler.on_compaction_finished(region_id, &manifest_ctx); scheduler.on_compaction_finished(region_id, Arc::new(MitoConfig::default()));
assert_eq!(1, scheduler.region_status.len()); assert_eq!(1, scheduler.region_status.len());
assert_eq!(2, job_scheduler.num_jobs()); assert_eq!(2, job_scheduler.num_jobs());
// 5 files for next compaction. // 5 files for next compaction.
@@ -540,7 +514,7 @@ mod tests {
&env.access_layer, &env.access_layer,
&purger, &purger,
OptionOutputTx::none(), OptionOutputTx::none(),
&manifest_ctx, Arc::new(MitoConfig::default()),
) )
.unwrap(); .unwrap();
assert_eq!(2, job_scheduler.num_jobs()); assert_eq!(2, job_scheduler.num_jobs());

View File

@@ -12,8 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::hash_map::Entry; use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};
use std::fmt::{Debug, Formatter}; use std::fmt::{Debug, Formatter};
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
@@ -34,15 +33,12 @@ use crate::compaction::picker::{CompactionTask, Picker};
use crate::compaction::CompactionRequest; use crate::compaction::CompactionRequest;
use crate::config::MitoConfig; use crate::config::MitoConfig;
use crate::error::{self, CompactRegionSnafu}; use crate::error::{self, CompactRegionSnafu};
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED}; use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED};
use crate::read::projection::ProjectionMapper; use crate::read::projection::ProjectionMapper;
use crate::read::scan_region::ScanInput; use crate::read::scan_region::ScanInput;
use crate::read::seq_scan::SeqScan; use crate::read::seq_scan::SeqScan;
use crate::read::{BoxedBatchReader, Source}; use crate::read::{BoxedBatchReader, Source};
use crate::region::options::IndexOptions; use crate::region::options::IndexOptions;
use crate::region::version::VersionControlRef;
use crate::region::{ManifestContextRef, RegionState};
use crate::request::{ use crate::request::{
BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, WorkerRequest, BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, WorkerRequest,
}; };
@@ -50,7 +46,6 @@ use crate::sst::file::{FileHandle, FileId, FileMeta, IndexType, Level};
use crate::sst::file_purger::FilePurgerRef; use crate::sst::file_purger::FilePurgerRef;
use crate::sst::parquet::WriteOptions; use crate::sst::parquet::WriteOptions;
use crate::sst::version::LevelMeta; use crate::sst::version::LevelMeta;
use crate::worker::WorkerListener;
const MAX_PARALLEL_COMPACTION: usize = 8; const MAX_PARALLEL_COMPACTION: usize = 8;
@@ -89,41 +84,35 @@ impl TwcsPicker {
/// fragmentation. For other windows, we allow at most 1 file at each window. /// fragmentation. For other windows, we allow at most 1 file at each window.
fn build_output( fn build_output(
&self, &self,
time_windows: &BTreeMap<i64, Window>, time_windows: &BTreeMap<i64, Vec<FileHandle>>,
active_window: Option<i64>, active_window: Option<i64>,
) -> Vec<CompactionOutput> { ) -> Vec<CompactionOutput> {
let mut output = vec![]; let mut output = vec![];
for (window, files) in time_windows { for (window, files) in time_windows {
let files_in_window = &files.files;
// we only remove deletion markers once no file in current window overlaps with any other window.
let filter_deleted = !files.overlapping;
if let Some(active_window) = active_window if let Some(active_window) = active_window
&& *window == active_window && *window == active_window
{ {
if files_in_window.len() > self.max_active_window_files { if files.len() > self.max_active_window_files {
output.push(CompactionOutput { output.push(CompactionOutput {
output_file_id: FileId::random(), output_file_id: FileId::random(),
output_level: 1, // we only have two levels and always compact to l1 output_level: 1, // we only have two levels and always compact to l1
inputs: files_in_window.clone(), inputs: files.clone(),
filter_deleted,
}); });
} else { } else {
debug!("Active window not present or no enough files in active window {:?}, window: {}", active_window, *window); debug!("Active window not present or no enough files in active window {:?}, window: {}", active_window, *window);
} }
} else { } else {
// not active writing window // not active writing window
if files_in_window.len() > self.max_inactive_window_files { if files.len() > self.max_inactive_window_files {
output.push(CompactionOutput { output.push(CompactionOutput {
output_file_id: FileId::random(), output_file_id: FileId::random(),
output_level: 1, output_level: 1,
inputs: files_in_window.clone(), inputs: files.clone(),
filter_deleted,
}); });
} else { } else {
debug!( debug!(
"No enough files, current: {}, max_inactive_window_files: {}", "No enough files, current: {}, max_inactive_window_files: {}",
files_in_window.len(), files.len(),
self.max_inactive_window_files self.max_inactive_window_files
) )
} }
@@ -144,9 +133,6 @@ impl Picker for TwcsPicker {
file_purger, file_purger,
start_time, start_time,
cache_manager, cache_manager,
manifest_ctx,
version_control,
listener,
} = req; } = req;
let region_metadata = current_version.metadata.clone(); let region_metadata = current_version.metadata.clone();
@@ -204,107 +190,29 @@ impl Picker for TwcsPicker {
storage: current_version.options.storage.clone(), storage: current_version.options.storage.clone(),
index_options: current_version.options.index_options.clone(), index_options: current_version.options.index_options.clone(),
append_mode: current_version.options.append_mode, append_mode: current_version.options.append_mode,
manifest_ctx,
version_control,
listener,
}; };
Some(Box::new(task)) Some(Box::new(task))
} }
} }
struct Window {
start: Timestamp,
end: Timestamp,
files: Vec<FileHandle>,
time_window: i64,
overlapping: bool,
}
impl Window {
/// Creates a new [Window] with given file.
fn new_with_file(file: FileHandle) -> Self {
let (start, end) = file.time_range();
Self {
start,
end,
files: vec![file],
time_window: 0,
overlapping: false,
}
}
/// Returns the time range of all files in current window (inclusive).
fn range(&self) -> (Timestamp, Timestamp) {
(self.start, self.end)
}
/// Adds a new file to window and updates time range.
fn add_file(&mut self, file: FileHandle) {
let (start, end) = file.time_range();
self.start = self.start.min(start);
self.end = self.end.max(end);
self.files.push(file);
}
}
/// Assigns files to windows with predefined window size (in seconds) by their max timestamps. /// Assigns files to windows with predefined window size (in seconds) by their max timestamps.
fn assign_to_windows<'a>( fn assign_to_windows<'a>(
files: impl Iterator<Item = &'a FileHandle>, files: impl Iterator<Item = &'a FileHandle>,
time_window_size: i64, time_window_size: i64,
) -> BTreeMap<i64, Window> { ) -> BTreeMap<i64, Vec<FileHandle>> {
let mut windows: HashMap<i64, Window> = HashMap::new(); let mut windows: BTreeMap<i64, Vec<FileHandle>> = BTreeMap::new();
// Iterates all files and assigns them to time windows according to their max timestamps // Iterates all files and assigns them to time windows according to their max timestamps
for f in files { for file in files {
let (_, end) = f.time_range(); let (_, end) = file.time_range();
let time_window = end let time_window = end
.convert_to(TimeUnit::Second) .convert_to(TimeUnit::Second)
.unwrap() .unwrap()
.value() .value()
.align_to_ceil_by_bucket(time_window_size) .align_to_ceil_by_bucket(time_window_size)
.unwrap_or(i64::MIN); .unwrap_or(i64::MIN);
windows.entry(time_window).or_default().push(file.clone());
match windows.entry(time_window) {
Entry::Occupied(mut e) => {
e.get_mut().add_file(f.clone());
} }
Entry::Vacant(e) => { windows
let mut window = Window::new_with_file(f.clone());
window.time_window = time_window;
e.insert(window);
}
}
}
if windows.is_empty() {
return BTreeMap::new();
}
let mut windows = windows.into_values().collect::<Vec<_>>();
windows.sort_unstable_by(|l, r| l.start.cmp(&r.start).then(l.end.cmp(&r.end).reverse()));
let mut current_range: (Timestamp, Timestamp) = windows[0].range(); // windows cannot be empty.
for idx in 1..windows.len() {
let next_range = windows[idx].range();
if overlaps(&current_range, &next_range) {
windows[idx - 1].overlapping = true;
windows[idx].overlapping = true;
}
current_range = (
current_range.0.min(next_range.0),
current_range.1.max(next_range.1),
);
}
windows.into_iter().map(|w| (w.time_window, w)).collect()
}
/// Checks if two inclusive timestamp ranges overlap with each other.
fn overlaps(l: &(Timestamp, Timestamp), r: &(Timestamp, Timestamp)) -> bool {
let (l, r) = if l.0 <= r.0 { (l, r) } else { (r, l) };
let (_, l_end) = l;
let (r_start, _) = r;
r_start <= l_end
} }
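
The window assignment above rests on two small pieces: bucketing files by their aligned max timestamp and checking whether two inclusive time ranges overlap. A compact sketch of both with plain integer timestamps in seconds (no `FileHandle`/`Timestamp` types):

use std::collections::BTreeMap;

/// A file is reduced to its inclusive (start, end) time range in seconds.
type TimeRange = (i64, i64);

/// Aligns a timestamp up to the end of its bucket, e.g. 1200 with bucket 3600 -> 3600.
fn align_to_ceil_by_bucket(ts: i64, bucket: i64) -> i64 {
    ts.div_euclid(bucket) * bucket + if ts.rem_euclid(bucket) == 0 { 0 } else { bucket }
}

/// Checks whether two inclusive ranges overlap.
fn overlaps(l: &TimeRange, r: &TimeRange) -> bool {
    let (l, r) = if l.0 <= r.0 { (l, r) } else { (r, l) };
    r.0 <= l.1
}

/// Buckets ranges by their aligned end timestamp, keyed by window.
fn assign_to_windows(ranges: &[TimeRange], window: i64) -> BTreeMap<i64, Vec<TimeRange>> {
    let mut windows: BTreeMap<i64, Vec<TimeRange>> = BTreeMap::new();
    for range in ranges {
        let key = align_to_ceil_by_bucket(range.1, window);
        windows.entry(key).or_default().push(*range);
    }
    windows
}

fn main() {
    // Files end at 1200 and 2400 (window 3600) and at 10800 (window 10800).
    let files = [(0, 1200), (0, 2400), (3600, 10800)];
    let windows = assign_to_windows(&files, 3600);
    assert_eq!(windows[&3600].len(), 2);
    assert_eq!(windows[&10800].len(), 1);

    // The first two ranges overlap each other; the third does not overlap them.
    assert!(overlaps(&(0, 1200), &(0, 2400)));
    assert!(!overlaps(&(0, 2400), &(3600, 10800)));
}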
/// Finds the latest active writing window among all files. /// Finds the latest active writing window among all files.
@@ -351,12 +259,6 @@ pub(crate) struct TwcsCompactionTask {
pub(crate) index_options: IndexOptions, pub(crate) index_options: IndexOptions,
/// The region is using append mode. /// The region is using append mode.
pub(crate) append_mode: bool, pub(crate) append_mode: bool,
/// Manifest context.
pub(crate) manifest_ctx: ManifestContextRef,
/// Version control to update.
pub(crate) version_control: VersionControlRef,
/// Event listener.
pub(crate) listener: WorkerListener,
} }
impl Debug for TwcsCompactionTask { impl Debug for TwcsCompactionTask {
@@ -442,7 +344,6 @@ impl TwcsCompactionTask {
sst_layer.clone(), sst_layer.clone(),
&output.inputs, &output.inputs,
append_mode, append_mode,
output.filter_deleted,
) )
.await?; .await?;
let file_meta_opt = sst_layer let file_meta_opt = sst_layer
@@ -497,55 +398,18 @@ impl TwcsCompactionTask {
Ok((output_files, inputs)) Ok((output_files, inputs))
} }
async fn handle_compaction(&mut self) -> error::Result<()> { async fn handle_compaction(&mut self) -> error::Result<(Vec<FileMeta>, Vec<FileMeta>)> {
self.mark_files_compacting(true); self.mark_files_compacting(true);
let merge_timer = COMPACTION_STAGE_ELAPSED let merge_timer = COMPACTION_STAGE_ELAPSED
.with_label_values(&["merge"]) .with_label_values(&["merge"])
.start_timer(); .start_timer();
let (added, mut deleted) = match self.merge_ssts().await { let (output, mut compacted) = self.merge_ssts().await.map_err(|e| {
Ok(v) => v,
Err(e) => {
error!(e; "Failed to compact region: {}", self.region_id); error!(e; "Failed to compact region: {}", self.region_id);
merge_timer.stop_and_discard(); merge_timer.stop_and_discard();
return Err(e); e
} })?;
}; compacted.extend(self.expired_ssts.iter().map(FileHandle::meta));
deleted.extend(self.expired_ssts.iter().map(FileHandle::meta)); Ok((output, compacted))
let merge_time = merge_timer.stop_and_record();
info!(
"Compacted SST files, region_id: {}, input: {:?}, output: {:?}, window: {:?}, waiter_num: {}, merge_time: {}s",
self.region_id,
deleted,
added,
self.compaction_time_window,
self.waiters.len(),
merge_time,
);
self.listener.on_merge_ssts_finished(self.region_id).await;
let _manifest_timer = COMPACTION_STAGE_ELAPSED
.with_label_values(&["write_manifest"])
.start_timer();
// Write region edit to manifest.
let edit = RegionEdit {
files_to_add: added,
files_to_remove: deleted,
compaction_time_window: self
.compaction_time_window
.map(|seconds| Duration::from_secs(seconds as u64)),
flushed_entry_id: None,
flushed_sequence: None,
};
let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone()));
// We might leak files if we fail to update manifest. We can add a cleanup task to
// remove them later.
self.manifest_ctx
.update_manifest(RegionState::Writable, action_list, || {
self.version_control
.apply_edit(edit, &[], self.file_purger.clone());
})
.await
} }
/// Handles compaction failure, notifies all waiters. /// Handles compaction failure, notifies all waiters.
@@ -573,11 +437,27 @@ impl TwcsCompactionTask {
impl CompactionTask for TwcsCompactionTask { impl CompactionTask for TwcsCompactionTask {
async fn run(&mut self) { async fn run(&mut self) {
let notify = match self.handle_compaction().await { let notify = match self.handle_compaction().await {
Ok(()) => BackgroundNotify::CompactionFinished(CompactionFinished { Ok((added, deleted)) => {
info!(
"Compacted SST files, input: {:?}, output: {:?}, window: {:?}, waiter_num: {}",
deleted,
added,
self.compaction_time_window,
self.waiters.len(),
);
BackgroundNotify::CompactionFinished(CompactionFinished {
region_id: self.region_id, region_id: self.region_id,
compaction_outputs: added,
compacted_files: deleted,
senders: std::mem::take(&mut self.waiters), senders: std::mem::take(&mut self.waiters),
file_purger: self.file_purger.clone(),
compaction_time_window: self
.compaction_time_window
.map(|seconds| Duration::from_secs(seconds as u64)),
start_time: self.start_time, start_time: self.start_time,
}), })
}
Err(e) => { Err(e) => {
error!(e; "Failed to compact region, region id: {}", self.region_id); error!(e; "Failed to compact region, region id: {}", self.region_id);
let err = Arc::new(e); let err = Arc::new(e);
@@ -692,8 +572,6 @@ pub(crate) struct CompactionOutput {
pub output_level: Level, pub output_level: Level,
/// Compaction input files. /// Compaction input files.
pub inputs: Vec<FileHandle>, pub inputs: Vec<FileHandle>,
/// Whether to remove deletion markers.
pub filter_deleted: bool,
} }
/// Builds [BoxedBatchReader] that reads all SST files and yields batches in primary key order. /// Builds [BoxedBatchReader] that reads all SST files and yields batches in primary key order.
@@ -702,12 +580,10 @@ async fn build_sst_reader(
sst_layer: AccessLayerRef, sst_layer: AccessLayerRef,
inputs: &[FileHandle], inputs: &[FileHandle],
append_mode: bool, append_mode: bool,
filter_deleted: bool,
) -> error::Result<BoxedBatchReader> { ) -> error::Result<BoxedBatchReader> {
let scan_input = ScanInput::new(sst_layer, ProjectionMapper::all(&metadata)?) let scan_input = ScanInput::new(sst_layer, ProjectionMapper::all(&metadata)?)
.with_files(inputs.to_vec()) .with_files(inputs.to_vec())
.with_append_mode(append_mode) .with_append_mode(append_mode)
.with_filter_deleted(filter_deleted)
// We ignore file not found error during compaction. // We ignore file not found error during compaction.
.with_ignore_file_not_found(true); .with_ignore_file_not_found(true);
SeqScan::new(scan_input).build_reader().await SeqScan::new(scan_input).build_reader().await
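
One side of this hunk threads a `filter_deleted` flag from the compaction output into the reader builder. A sketch of that builder-style option with a hypothetical `ScanOptions` struct; the real `ScanInput`/`SeqScan` API is not reproduced here:

/// Hypothetical, simplified scan options mirroring the builder calls above.
#[derive(Debug, Default, PartialEq)]
struct ScanOptions {
    append_mode: bool,
    filter_deleted: bool,
    ignore_file_not_found: bool,
}

impl ScanOptions {
    fn new() -> Self {
        Self::default()
    }

    fn with_append_mode(mut self, append_mode: bool) -> Self {
        self.append_mode = append_mode;
        self
    }

    /// Only remove deletion markers when the compaction window does not
    /// overlap any other window; otherwise deletes must be kept.
    fn with_filter_deleted(mut self, filter_deleted: bool) -> Self {
        self.filter_deleted = filter_deleted;
        self
    }

    fn with_ignore_file_not_found(mut self, ignore: bool) -> Self {
        self.ignore_file_not_found = ignore;
        self
    }
}

fn main() {
    // A window with no overlaps may drop deletion markers during compaction.
    let non_overlapping = ScanOptions::new()
        .with_append_mode(false)
        .with_filter_deleted(true)
        .with_ignore_file_not_found(true);
    assert!(non_overlapping.filter_deleted);

    // An overlapping window keeps them, so deletes still shadow older data.
    let overlapping = ScanOptions::new()
        .with_append_mode(false)
        .with_filter_deleted(false)
        .with_ignore_file_not_found(true);
    assert!(!overlapping.filter_deleted);
}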
@@ -766,7 +642,7 @@ mod tests {
.iter(), .iter(),
3, 3,
); );
assert_eq!(5, windows.get(&0).unwrap().files.len()); assert_eq!(5, windows.get(&0).unwrap().len());
let files = [FileId::random(); 3]; let files = [FileId::random(); 3];
let windows = assign_to_windows( let windows = assign_to_windows(
@@ -780,148 +656,15 @@ mod tests {
); );
assert_eq!( assert_eq!(
files[0], files[0],
windows.get(&0).unwrap().files.first().unwrap().file_id() windows.get(&0).unwrap().first().unwrap().file_id()
); );
assert_eq!( assert_eq!(
files[1], files[1],
windows.get(&3).unwrap().files.first().unwrap().file_id() windows.get(&3).unwrap().first().unwrap().file_id()
); );
assert_eq!( assert_eq!(
files[2], files[2],
windows.get(&12).unwrap().files.first().unwrap().file_id() windows.get(&12).unwrap().first().unwrap().file_id()
);
}
/// (Window value, overlapping, files' time ranges in window)
type ExpectedWindowSpec = (i64, bool, Vec<(i64, i64)>);
fn check_assign_to_windows_with_overlapping(
file_time_ranges: &[(i64, i64)],
time_window: i64,
expected_files: &[ExpectedWindowSpec],
) {
let files: Vec<_> = (0..file_time_ranges.len())
.map(|_| FileId::random())
.collect();
let file_handles = files
.iter()
.zip(file_time_ranges.iter())
.map(|(file_id, range)| new_file_handle(*file_id, range.0, range.1, 0))
.collect::<Vec<_>>();
let windows = assign_to_windows(file_handles.iter(), time_window);
for (expected_window, overlapping, window_files) in expected_files {
let actual_window = windows.get(expected_window).unwrap();
assert_eq!(*overlapping, actual_window.overlapping);
let mut file_ranges = actual_window
.files
.iter()
.map(|f| {
let (s, e) = f.time_range();
(s.value(), e.value())
})
.collect::<Vec<_>>();
file_ranges.sort_unstable_by(|l, r| l.0.cmp(&r.0).then(l.1.cmp(&r.1)));
assert_eq!(window_files, &file_ranges);
}
}
#[test]
fn test_assign_to_windows_with_overlapping() {
check_assign_to_windows_with_overlapping(
&[(0, 999), (1000, 1999), (2000, 2999)],
2,
&[
(0, false, vec![(0, 999)]),
(2, false, vec![(1000, 1999), (2000, 2999)]),
],
);
check_assign_to_windows_with_overlapping(
&[(0, 1), (0, 999), (100, 2999)],
2,
&[
(0, true, vec![(0, 1), (0, 999)]),
(2, true, vec![(100, 2999)]),
],
);
check_assign_to_windows_with_overlapping(
&[(0, 999), (1000, 1999), (2000, 2999), (3000, 3999)],
2,
&[
(0, false, vec![(0, 999)]),
(2, false, vec![(1000, 1999), (2000, 2999)]),
(4, false, vec![(3000, 3999)]),
],
);
check_assign_to_windows_with_overlapping(
&[
(0, 999),
(1000, 1999),
(2000, 2999),
(3000, 3999),
(0, 3999),
],
2,
&[
(0, true, vec![(0, 999)]),
(2, true, vec![(1000, 1999), (2000, 2999)]),
(4, true, vec![(0, 3999), (3000, 3999)]),
],
);
check_assign_to_windows_with_overlapping(
&[
(0, 999),
(1000, 1999),
(2000, 2999),
(3000, 3999),
(1999, 3999),
],
2,
&[
(0, false, vec![(0, 999)]),
(2, true, vec![(1000, 1999), (2000, 2999)]),
(4, true, vec![(1999, 3999), (3000, 3999)]),
],
);
check_assign_to_windows_with_overlapping(
&[
(0, 999), // window 0
(1000, 1999), // window 2
(2000, 2999), // window 2
(3000, 3999), // window 4
(2999, 3999), // window 4
],
2,
&[
// window 2 overlaps with window 4
(0, false, vec![(0, 999)]),
(2, true, vec![(1000, 1999), (2000, 2999)]),
(4, true, vec![(2999, 3999), (3000, 3999)]),
],
);
check_assign_to_windows_with_overlapping(
&[
(0, 999), // window 0
(1000, 1999), // window 2
(2000, 2999), // window 2
(3000, 3999), // window 4
(0, 1000), // // window 2
],
2,
&[
// only window 0 overlaps with window 2.
(0, true, vec![(0, 999)]),
(2, true, vec![(0, 1000), (1000, 1999), (2000, 2999)]),
(4, false, vec![(3000, 3999)]),
],
); );
} }

View File

@@ -345,7 +345,7 @@ async fn test_catchup_with_manifest_update() {
// Ensures the mutable is empty. // Ensures the mutable is empty.
assert!(region.version().memtables.mutable.is_empty()); assert!(region.version().memtables.mutable.is_empty());
let manifest = region.manifest_ctx.manifest().await; let manifest = region.manifest_manager.read().await.manifest();
assert_eq!(manifest.manifest_version, 0); assert_eq!(manifest.manifest_version, 0);
let resp = follower_engine let resp = follower_engine
@@ -361,7 +361,7 @@ async fn test_catchup_with_manifest_update() {
// The inner region was replaced. We must get it again. // The inner region was replaced. We must get it again.
let region = follower_engine.get_region(region_id).unwrap(); let region = follower_engine.get_region(region_id).unwrap();
let manifest = region.manifest_ctx.manifest().await; let manifest = region.manifest_manager.read().await.manifest();
assert_eq!(manifest.manifest_version, 2); assert_eq!(manifest.manifest_version, 2);
assert!(!region.is_writable()); assert!(!region.is_writable());

View File

@@ -149,102 +149,6 @@ async fn test_compaction_region() {
assert_eq!((0..25).map(|v| v * 1000).collect::<Vec<_>>(), vec); assert_eq!((0..25).map(|v| v * 1000).collect::<Vec<_>>(), vec);
} }
#[tokio::test]
async fn test_compaction_region_with_overlapping() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::new();
let engine = env.create_engine(MitoConfig::default()).await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new()
.insert_option("compaction.type", "twcs")
.insert_option("compaction.twcs.max_active_window_files", "2")
.insert_option("compaction.twcs.max_inactive_window_files", "2")
.insert_option("compaction.twcs.time_window", "1h")
.build();
let column_schemas = request
.column_metadatas
.iter()
.map(column_metadata_to_column_schema)
.collect::<Vec<_>>();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
// Flush 4 SSTs for compaction.
put_and_flush(&engine, region_id, &column_schemas, 0..1200).await; // window 3600
put_and_flush(&engine, region_id, &column_schemas, 0..2400).await; // window 3600
put_and_flush(&engine, region_id, &column_schemas, 3600..10800).await; // window 10800
delete_and_flush(&engine, region_id, &column_schemas, 0..3600).await; // window 3600
let result = engine
.handle_request(region_id, RegionRequest::Compact(RegionCompactRequest {}))
.await
.unwrap();
assert_eq!(result.affected_rows, 0);
let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap();
assert_eq!(
2,
scanner.num_files(),
"unexpected files: {:?}",
scanner.file_ids()
);
let stream = scanner.scan().await.unwrap();
let vec = collect_stream_ts(stream).await;
assert_eq!((3600..10800).map(|i| { i * 1000 }).collect::<Vec<_>>(), vec);
}
#[tokio::test]
async fn test_compaction_region_with_overlapping_delete_all() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::new();
let engine = env.create_engine(MitoConfig::default()).await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new()
.insert_option("compaction.type", "twcs")
.insert_option("compaction.twcs.max_active_window_files", "2")
.insert_option("compaction.twcs.max_inactive_window_files", "2")
.insert_option("compaction.twcs.time_window", "1h")
.build();
let column_schemas = request
.column_metadatas
.iter()
.map(column_metadata_to_column_schema)
.collect::<Vec<_>>();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
// Flush 4 SSTs for compaction.
put_and_flush(&engine, region_id, &column_schemas, 0..1200).await; // window 3600
put_and_flush(&engine, region_id, &column_schemas, 0..2400).await; // window 3600
put_and_flush(&engine, region_id, &column_schemas, 0..3600).await; // window 3600
delete_and_flush(&engine, region_id, &column_schemas, 0..10800).await; // window 10800
let result = engine
.handle_request(region_id, RegionRequest::Compact(RegionCompactRequest {}))
.await
.unwrap();
assert_eq!(result.affected_rows, 0);
let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap();
assert_eq!(
4,
scanner.num_files(),
"unexpected files: {:?}",
scanner.file_ids()
);
let stream = scanner.scan().await.unwrap();
let vec = collect_stream_ts(stream).await;
assert!(vec.is_empty());
}
// For issue https://github.com/GreptimeTeam/greptimedb/issues/3633 // For issue https://github.com/GreptimeTeam/greptimedb/issues/3633
#[tokio::test] #[tokio::test]
async fn test_readonly_during_compaction() { async fn test_readonly_during_compaction() {

View File

@@ -51,9 +51,9 @@ pub trait EventListener: Send + Sync {
let _ = removed; let _ = removed;
} }
/// Notifies the listener that SSTs have been merged and the region /// Notifies the listener that the region is going to handle the compaction
/// is going to update its manifest. /// finished request.
async fn on_merge_ssts_finished(&self, region_id: RegionId) { async fn on_handle_compaction_finished(&self, region_id: RegionId) {
let _ = region_id; let _ = region_id;
} }
} }
@@ -201,7 +201,7 @@ impl CompactionListener {
#[async_trait] #[async_trait]
impl EventListener for CompactionListener { impl EventListener for CompactionListener {
async fn on_merge_ssts_finished(&self, region_id: RegionId) { async fn on_handle_compaction_finished(&self, region_id: RegionId) {
info!("Handle compaction finished request, region {region_id}"); info!("Handle compaction finished request, region {region_id}");
self.handle_finished_notify.notify_one(); self.handle_finished_notify.notify_one();

View File

@@ -127,7 +127,7 @@ async fn test_engine_open_readonly() {
) )
.await .await
.unwrap_err(); .unwrap_err();
assert_eq!(StatusCode::RegionNotReady, err.status_code()); assert_eq!(StatusCode::RegionReadonly, err.status_code());
assert_eq!(Some(RegionRole::Follower), engine.role(region_id)); assert_eq!(Some(RegionRole::Follower), engine.role(region_id));
// Set writable and write. // Set writable and write.

View File

@@ -66,7 +66,7 @@ async fn test_set_readonly_gracefully() {
.await .await
.unwrap_err(); .unwrap_err();
assert_eq!(error.status_code(), StatusCode::RegionNotReady); assert_eq!(error.status_code(), StatusCode::RegionReadonly);
engine.set_writable(region_id, true).unwrap(); engine.set_writable(region_id, true).unwrap();

View File

@@ -29,7 +29,6 @@ use store_api::manifest::ManifestVersion;
 use store_api::storage::RegionId;

 use crate::cache::file_cache::FileType;
-use crate::region::RegionState;
 use crate::sst::file::FileId;
 use crate::worker::WorkerId;
@@ -396,11 +395,9 @@ pub enum Error {
         location: Location,
     },

-    #[snafu(display("Region {} is in {:?} state, expect: {:?}", region_id, state, expect))]
-    RegionState {
+    #[snafu(display("Region {} is read only", region_id))]
+    RegionReadonly {
         region_id: RegionId,
-        state: RegionState,
-        expect: RegionState,
         location: Location,
     },
@@ -672,7 +669,7 @@ impl ErrorExt for Error {
             CompactRegion { source, .. } => source.status_code(),
             CompatReader { .. } => StatusCode::Unexpected,
             InvalidRegionRequest { source, .. } => source.status_code(),
-            RegionState { .. } => StatusCode::RegionNotReady,
+            RegionReadonly { .. } => StatusCode::RegionReadonly,
             JsonOptions { .. } => StatusCode::InvalidArguments,
             EmptyRegionDir { .. } | EmptyManifestDir { .. } => StatusCode::RegionNotFound,
             ArrowReader { .. } => StatusCode::StorageUnavailable,
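
For readers unfamiliar with snafu, the variant above comes with a generated RegionReadonlySnafu selector, so a call site can guard writes in one line. A hedged sketch only (the helper name is made up; Result is the crate's alias, and it mirrors the writable_region check that appears later in this diff):

use snafu::ensure;

/// Sketch: reject writes when the region is read only.
fn check_writable(region: &MitoRegion, region_id: RegionId) -> Result<()> {
    ensure!(region.is_writable(), RegionReadonlySnafu { region_id });
    Ok(())
}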


@@ -31,12 +31,10 @@ use crate::config::MitoConfig;
use crate::error::{ use crate::error::{
Error, FlushRegionSnafu, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result, Error, FlushRegionSnafu, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result,
}; };
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_ERRORS_TOTAL, FLUSH_REQUESTS_TOTAL}; use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_ERRORS_TOTAL, FLUSH_REQUESTS_TOTAL};
use crate::read::Source; use crate::read::Source;
use crate::region::options::IndexOptions; use crate::region::options::IndexOptions;
use crate::region::version::{VersionControlData, VersionControlRef}; use crate::region::version::{VersionControlData, VersionControlRef, VersionRef};
use crate::region::{ManifestContextRef, RegionState};
use crate::request::{ use crate::request::{
BackgroundNotify, FlushFailed, FlushFinished, OptionOutputTx, OutputTx, SenderDdlRequest, BackgroundNotify, FlushFailed, FlushFinished, OptionOutputTx, OutputTx, SenderDdlRequest,
SenderWriteRequest, WorkerRequest, SenderWriteRequest, WorkerRequest,
@@ -206,7 +204,6 @@ pub(crate) struct RegionFlushTask {
pub(crate) engine_config: Arc<MitoConfig>, pub(crate) engine_config: Arc<MitoConfig>,
pub(crate) row_group_size: Option<usize>, pub(crate) row_group_size: Option<usize>,
pub(crate) cache_manager: CacheManagerRef, pub(crate) cache_manager: CacheManagerRef,
pub(crate) manifest_ctx: ManifestContextRef,
/// Index options for the region. /// Index options for the region.
pub(crate) index_options: IndexOptions, pub(crate) index_options: IndexOptions,
@@ -243,30 +240,36 @@ impl RegionFlushTask {
// Get a version of this region before creating a job to get current // Get a version of this region before creating a job to get current
// wal entry id, sequence and immutable memtables. // wal entry id, sequence and immutable memtables.
let version_data = version_control.current(); let version_data = version_control.current();
// This is used to update the version.
let version_control = version_control.clone();
Box::pin(async move { Box::pin(async move {
self.do_flush(version_data, &version_control).await; self.do_flush(version_data).await;
}) })
} }
/// Runs the flush task. /// Runs the flush task.
async fn do_flush( async fn do_flush(&mut self, version_data: VersionControlData) {
&mut self,
version_data: VersionControlData,
version_control: &VersionControlRef,
) {
let timer = FLUSH_ELAPSED.with_label_values(&["total"]).start_timer(); let timer = FLUSH_ELAPSED.with_label_values(&["total"]).start_timer();
self.listener.on_flush_begin(self.region_id).await; self.listener.on_flush_begin(self.region_id).await;
let worker_request = match self.flush_memtables(&version_data, version_control).await { let worker_request = match self.flush_memtables(&version_data.version).await {
Ok(()) => { Ok(file_metas) => {
let memtables_to_remove = version_data
.version
.memtables
.immutables()
.iter()
.map(|m| m.id())
.collect();
let flush_finished = FlushFinished { let flush_finished = FlushFinished {
region_id: self.region_id, region_id: self.region_id,
file_metas,
// The last entry has been flushed. // The last entry has been flushed.
flushed_entry_id: version_data.last_entry_id, flushed_entry_id: version_data.last_entry_id,
flushed_sequence: version_data.committed_sequence,
memtables_to_remove,
senders: std::mem::take(&mut self.senders), senders: std::mem::take(&mut self.senders),
file_purger: self.file_purger.clone(),
_timer: timer, _timer: timer,
}; };
WorkerRequest::Background { WorkerRequest::Background {
@@ -290,13 +293,8 @@ impl RegionFlushTask {
self.send_worker_request(worker_request).await; self.send_worker_request(worker_request).await;
} }
/// Flushes memtables to level 0 SSTs and updates the manifest. /// Flushes memtables to level 0 SSTs.
async fn flush_memtables( async fn flush_memtables(&self, version: &VersionRef) -> Result<Vec<FileMeta>> {
&self,
version_data: &VersionControlData,
version_control: &VersionControlRef,
) -> Result<()> {
let version = &version_data.version;
let timer = FLUSH_ELAPSED let timer = FLUSH_ELAPSED
.with_label_values(&["flush_memtables"]) .with_label_values(&["flush_memtables"])
.start_timer(); .start_timer();
@@ -384,31 +382,7 @@ impl RegionFlushTask {
timer.stop_and_record(), timer.stop_and_record(),
); );
let memtables_to_remove: SmallVec<[_; 2]> = version_data Ok(file_metas)
.version
.memtables
.immutables()
.iter()
.map(|m| m.id())
.collect();
let edit = RegionEdit {
files_to_add: file_metas,
files_to_remove: Vec::new(),
compaction_time_window: None,
// The last entry has been flushed.
flushed_entry_id: Some(version_data.last_entry_id),
flushed_sequence: Some(version_data.committed_sequence),
};
info!("Applying {edit:?} to region {}", self.region_id);
let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone()));
// We will leak files if the manifest update fails, but we ignore them for simplicity. We can
// add a cleanup job to remove them later.
self.manifest_ctx
.update_manifest(RegionState::Writable, action_list, || {
version_control.apply_edit(edit, &memtables_to_remove, self.file_purger.clone());
})
.await
} }
/// Notify flush job status. /// Notify flush job status.
@@ -801,9 +775,6 @@ mod tests {
engine_config: Arc::new(MitoConfig::default()), engine_config: Arc::new(MitoConfig::default()),
row_group_size: None, row_group_size: None,
cache_manager: Arc::new(CacheManager::default()), cache_manager: Arc::new(CacheManager::default()),
manifest_ctx: env
.mock_manifest_context(version_control.current().version.metadata.clone())
.await,
index_options: IndexOptions::default(), index_options: IndexOptions::default(),
}; };
task.push_sender(OptionOutputTx::from(output_tx)); task.push_sender(OptionOutputTx::from(output_tx));


@@ -257,8 +257,9 @@ impl RegionManifestManager {
     }

     /// Stops the manager.
-    pub async fn stop(&mut self) {
+    pub async fn stop(&mut self) -> Result<()> {
         self.stopped = true;
+        Ok(())
     }

     /// Updates the manifest. Returns the current manifest version number.
@@ -523,7 +524,7 @@ mod test {
         .unwrap()
         .unwrap();
     // Stops it.
-    manager.stop().await;
+    manager.stop().await.unwrap();

     // Open it.
     let manager = env
@@ -563,7 +564,7 @@ mod test {
     manager.validate_manifest(&new_metadata, 1);

     // Reopen the manager.
-    manager.stop().await;
+    manager.stop().await.unwrap();
     let manager = env
         .create_manifest_manager(CompressionType::Uncompressed, 10, None)
         .await
@@ -650,7 +651,7 @@ mod test {
     // Reopen the manager,
     // we just calculate the size from the latest checkpoint file
-    manager.stop().await;
+    manager.stop().await.unwrap();
     let manager = env
         .create_manifest_manager(CompressionType::Uncompressed, 10, None)
         .await
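
Because stop() is now fallible, non-test callers would propagate the error instead of unwrapping. A minimal sketch (the wrapper function is illustrative, not part of this diff):

/// Sketch: shut the manifest manager down and bubble the error up with `?`.
async fn shutdown_manifest(manager: &mut RegionManifestManager) -> Result<()> {
    manager.stop().await?;
    Ok(())
}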


@@ -152,7 +152,7 @@ async fn manager_with_checkpoint_distance_1() {
     assert_eq!(expected_json, raw_json);

     // reopen the manager
-    manager.stop().await;
+    manager.stop().await.unwrap();
     let manager = reopen_manager(&env, 1, CompressionType::Uncompressed).await;
     assert_eq!(10, manager.manifest().manifest_version);
 }


@@ -382,17 +382,17 @@ impl Batch {
         self.take_in_place(&indices)
     }

-    /// Returns ids and datatypes of fields in the [Batch] after applying the `projection`.
+    /// Returns ids of fields in the [Batch] after applying the `projection`.
     pub(crate) fn projected_fields(
         metadata: &RegionMetadata,
         projection: &[ColumnId],
-    ) -> Vec<(ColumnId, ConcreteDataType)> {
+    ) -> Vec<ColumnId> {
         let projected_ids: HashSet<_> = projection.iter().copied().collect();
         metadata
             .field_columns()
             .filter_map(|column| {
                 if projected_ids.contains(&column.column_id) {
-                    Some((column.column_id, column.column_schema.data_type.clone()))
+                    Some(column.column_id)
                 } else {
                     None
                 }


@@ -16,7 +16,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use datatypes::data_type::ConcreteDataType;
use datatypes::value::Value; use datatypes::value::Value;
use datatypes::vectors::VectorRef; use datatypes::vectors::VectorRef;
use snafu::{ensure, OptionExt, ResultExt}; use snafu::{ensure, OptionExt, ResultExt};
@@ -86,7 +85,7 @@ pub(crate) fn has_same_columns(left: &RegionMetadata, right: &RegionMetadata) ->
} }
for (left_col, right_col) in left.column_metadatas.iter().zip(&right.column_metadatas) { for (left_col, right_col) in left.column_metadatas.iter().zip(&right.column_metadatas) {
if left_col.column_id != right_col.column_id || !left_col.is_same_datatype(right_col) { if left_col.column_id != right_col.column_id {
return false; return false;
} }
debug_assert_eq!( debug_assert_eq!(
@@ -135,8 +134,8 @@ impl CompatPrimaryKey {
/// Helper to make fields compatible. /// Helper to make fields compatible.
#[derive(Debug)] #[derive(Debug)]
struct CompatFields { struct CompatFields {
/// Column Ids and DataTypes the reader actually returns. /// Column Ids the reader actually returns.
actual_fields: Vec<(ColumnId, ConcreteDataType)>, actual_fields: Vec<ColumnId>,
/// Indices to convert actual fields to expect fields. /// Indices to convert actual fields to expect fields.
index_or_defaults: Vec<IndexOrDefault>, index_or_defaults: Vec<IndexOrDefault>,
} }
@@ -150,28 +149,14 @@ impl CompatFields {
.actual_fields .actual_fields
.iter() .iter()
.zip(batch.fields()) .zip(batch.fields())
.all(|((id, _), batch_column)| *id == batch_column.column_id)); .all(|(id, batch_column)| *id == batch_column.column_id));
let len = batch.num_rows(); let len = batch.num_rows();
let fields = self let fields = self
.index_or_defaults .index_or_defaults
.iter() .iter()
.map(|index_or_default| match index_or_default { .map(|index_or_default| match index_or_default {
IndexOrDefault::Index { pos, cast_type } => { IndexOrDefault::Index(index) => batch.fields()[*index].clone(),
let old_column = &batch.fields()[*pos];
let data = if let Some(ty) = cast_type {
// Safety: We ensure type can be converted and the new batch should be valid.
// Tips: `safe` must be true in `CastOptions`, which will replace the specific value with null when it cannot be converted.
old_column.data.cast(ty).unwrap()
} else {
old_column.data.clone()
};
BatchColumn {
column_id: old_column.column_id,
data,
}
}
IndexOrDefault::DefaultValue { IndexOrDefault::DefaultValue {
column_id, column_id,
default_vector, default_vector,
@@ -263,23 +248,15 @@ fn may_compat_fields(
let source_field_index: HashMap<_, _> = actual_fields let source_field_index: HashMap<_, _> = actual_fields
.iter() .iter()
.enumerate() .enumerate()
.map(|(idx, (column_id, data_type))| (*column_id, (idx, data_type))) .map(|(idx, column_id)| (*column_id, idx))
.collect(); .collect();
let index_or_defaults = expect_fields let index_or_defaults = expect_fields
.iter() .iter()
.map(|(column_id, expect_data_type)| { .map(|column_id| {
if let Some((index, actual_data_type)) = source_field_index.get(column_id) { if let Some(index) = source_field_index.get(column_id) {
let mut cast_type = None;
if expect_data_type != *actual_data_type {
cast_type = Some(expect_data_type.clone())
}
// Source has this field. // Source has this field.
Ok(IndexOrDefault::Index { Ok(IndexOrDefault::Index(*index))
pos: *index,
cast_type,
})
} else { } else {
// Safety: mapper must have this column. // Safety: mapper must have this column.
let column = mapper.metadata().column_by_id(*column_id).unwrap(); let column = mapper.metadata().column_by_id(*column_id).unwrap();
@@ -316,10 +293,7 @@ fn may_compat_fields(
#[derive(Debug)] #[derive(Debug)]
enum IndexOrDefault { enum IndexOrDefault {
/// Index of the column in source batch. /// Index of the column in source batch.
Index { Index(usize),
pos: usize,
cast_type: Option<ConcreteDataType>,
},
/// Default value for the column. /// Default value for the column.
DefaultValue { DefaultValue {
/// Id of the column. /// Id of the column.
@@ -346,19 +320,27 @@ mod tests {
/// Creates a new [RegionMetadata]. /// Creates a new [RegionMetadata].
fn new_metadata( fn new_metadata(
semantic_types: &[(ColumnId, SemanticType, ConcreteDataType)], semantic_types: &[(ColumnId, SemanticType)],
primary_key: &[ColumnId], primary_key: &[ColumnId],
) -> RegionMetadata { ) -> RegionMetadata {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
for (id, semantic_type, data_type) in semantic_types { for (id, semantic_type) in semantic_types {
let column_schema = match semantic_type { let column_schema = match semantic_type {
SemanticType::Tag => { SemanticType::Tag => ColumnSchema::new(
ColumnSchema::new(format!("tag_{id}"), data_type.clone(), true) format!("tag_{id}"),
} ConcreteDataType::string_datatype(),
SemanticType::Field => { true,
ColumnSchema::new(format!("field_{id}"), data_type.clone(), true) ),
} SemanticType::Field => ColumnSchema::new(
SemanticType::Timestamp => ColumnSchema::new("ts", data_type.clone(), false), format!("field_{id}"),
ConcreteDataType::int64_datatype(),
true,
),
SemanticType::Timestamp => ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
}; };
builder.push_column_metadata(ColumnMetadata { builder.push_column_metadata(ColumnMetadata {
@@ -427,26 +409,18 @@ mod tests {
fn test_invalid_pk_len() { fn test_invalid_pk_len() {
let reader_meta = new_metadata( let reader_meta = new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Tag),
ConcreteDataType::timestamp_millisecond_datatype(), (3, SemanticType::Field),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Tag, ConcreteDataType::string_datatype()),
(3, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1, 2], &[1, 2],
); );
let expect_meta = new_metadata( let expect_meta = new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
); );
@@ -457,28 +431,20 @@ mod tests {
fn test_different_pk() { fn test_different_pk() {
let reader_meta = new_metadata( let reader_meta = new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Tag),
ConcreteDataType::timestamp_millisecond_datatype(), (3, SemanticType::Field),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Tag, ConcreteDataType::string_datatype()),
(3, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[2, 1], &[2, 1],
); );
let expect_meta = new_metadata( let expect_meta = new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Tag),
ConcreteDataType::timestamp_millisecond_datatype(), (3, SemanticType::Field),
), (4, SemanticType::Tag),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Tag, ConcreteDataType::string_datatype()),
(3, SemanticType::Field, ConcreteDataType::int64_datatype()),
(4, SemanticType::Tag, ConcreteDataType::string_datatype()),
], ],
&[1, 2, 4], &[1, 2, 4],
); );
@@ -489,13 +455,9 @@ mod tests {
fn test_same_pk() { fn test_same_pk() {
let reader_meta = new_metadata( let reader_meta = new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
); );
@@ -508,13 +470,9 @@ mod tests {
fn test_same_fields() { fn test_same_fields() {
let reader_meta = Arc::new(new_metadata( let reader_meta = Arc::new(new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
)); ));
@@ -526,27 +484,19 @@ mod tests {
async fn test_compat_reader() { async fn test_compat_reader() {
let reader_meta = Arc::new(new_metadata( let reader_meta = Arc::new(new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
)); ));
let expect_meta = Arc::new(new_metadata( let expect_meta = Arc::new(new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(), (3, SemanticType::Tag),
), (4, SemanticType::Field),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
(3, SemanticType::Tag, ConcreteDataType::string_datatype()),
(4, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1, 3], &[1, 3],
)); ));
@@ -575,27 +525,19 @@ mod tests {
async fn test_compat_reader_different_order() { async fn test_compat_reader_different_order() {
let reader_meta = Arc::new(new_metadata( let reader_meta = Arc::new(new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
)); ));
let expect_meta = Arc::new(new_metadata( let expect_meta = Arc::new(new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (3, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(), (2, SemanticType::Field),
), (4, SemanticType::Field),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(3, SemanticType::Field, ConcreteDataType::int64_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
(4, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
)); ));
@@ -618,85 +560,23 @@ mod tests {
.await; .await;
} }
#[tokio::test]
async fn test_compat_reader_different_types() {
let actual_meta = Arc::new(new_metadata(
&[
(
0,
SemanticType::Timestamp,
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
],
&[1],
));
let expect_meta = Arc::new(new_metadata(
&[
(
0,
SemanticType::Timestamp,
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::string_datatype()),
],
&[1],
));
let mapper = ProjectionMapper::all(&expect_meta).unwrap();
let k1 = encode_key(&[Some("a")]);
let k2 = encode_key(&[Some("b")]);
let source_reader = VecBatchReader::new(&[
new_batch(&k1, &[(2, false)], 1000, 3),
new_batch(&k2, &[(2, false)], 1000, 3),
]);
let fn_batch_cast = |batch: Batch| {
let mut new_fields = batch.fields.clone();
new_fields[0].data = new_fields[0]
.data
.cast(&ConcreteDataType::string_datatype())
.unwrap();
batch.with_fields(new_fields).unwrap()
};
let mut compat_reader = CompatReader::new(&mapper, actual_meta, source_reader).unwrap();
check_reader_result(
&mut compat_reader,
&[
fn_batch_cast(new_batch(&k1, &[(2, false)], 1000, 3)),
fn_batch_cast(new_batch(&k2, &[(2, false)], 1000, 3)),
],
)
.await;
}
#[tokio::test] #[tokio::test]
async fn test_compat_reader_projection() { async fn test_compat_reader_projection() {
let reader_meta = Arc::new(new_metadata( let reader_meta = Arc::new(new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (2, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(),
),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
)); ));
let expect_meta = Arc::new(new_metadata( let expect_meta = Arc::new(new_metadata(
&[ &[
( (0, SemanticType::Timestamp),
0, (1, SemanticType::Tag),
SemanticType::Timestamp, (3, SemanticType::Field),
ConcreteDataType::timestamp_millisecond_datatype(), (2, SemanticType::Field),
), (4, SemanticType::Field),
(1, SemanticType::Tag, ConcreteDataType::string_datatype()),
(3, SemanticType::Field, ConcreteDataType::int64_datatype()),
(2, SemanticType::Field, ConcreteDataType::int64_datatype()),
(4, SemanticType::Field, ConcreteDataType::int64_datatype()),
], ],
&[1], &[1],
)); ));


@@ -53,8 +53,8 @@ pub struct ProjectionMapper {
     /// Ids of columns to project. It keeps ids in the same order as the `projection`
     /// indices to build the mapper.
     column_ids: Vec<ColumnId>,
-    /// Ids and DataTypes of field columns in the [Batch].
-    batch_fields: Vec<(ColumnId, ConcreteDataType)>,
+    /// Ids of field columns in the [Batch].
+    batch_fields: Vec<ColumnId>,
 }

 impl ProjectionMapper {
@@ -95,7 +95,7 @@ impl ProjectionMapper {
         let field_id_to_index: HashMap<_, _> = batch_fields
             .iter()
             .enumerate()
-            .map(|(index, (column_id, _))| (*column_id, index))
+            .map(|(index, column_id)| (*column_id, index))
             .collect();
         // For each projected column, compute its index in batches.
         let mut batch_indices = Vec::with_capacity(projection.len());
@@ -151,7 +151,7 @@ impl ProjectionMapper {
     }

     /// Returns ids of fields in [Batch]es the mapper expects to convert.
-    pub(crate) fn batch_fields(&self) -> &[(ColumnId, ConcreteDataType)] {
+    pub(crate) fn batch_fields(&self) -> &[ColumnId] {
         &self.batch_fields
     }
@@ -173,7 +173,7 @@ impl ProjectionMapper {
             .batch_fields
             .iter()
             .zip(batch.fields())
-            .all(|((id, _), batch_col)| *id == batch_col.column_id));
+            .all(|(id, batch_col)| *id == batch_col.column_id));

         // Skips decoding pk if we don't need to output it.
         let pk_values = if self.has_tags {
@@ -344,13 +344,7 @@ mod tests {
         );
         let mapper = ProjectionMapper::all(&metadata).unwrap();
         assert_eq!([0, 1, 2, 3, 4], mapper.column_ids());
-        assert_eq!(
-            [
-                (3, ConcreteDataType::int64_datatype()),
-                (4, ConcreteDataType::int64_datatype())
-            ],
-            mapper.batch_fields()
-        );
+        assert_eq!([3, 4], mapper.batch_fields());

         // With vector cache.
         let cache = CacheManager::builder().vector_cache_size(1024).build();
@@ -384,10 +378,7 @@ mod tests {
         // Columns v1, k0
         let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter()).unwrap();
         assert_eq!([4, 1], mapper.column_ids());
-        assert_eq!(
-            [(4, ConcreteDataType::int64_datatype())],
-            mapper.batch_fields()
-        );
+        assert_eq!([4], mapper.batch_fields());
         let batch = new_batch(0, &[1, 2], &[(4, 4)], 3);
         let record_batch = mapper.convert(&batch, None).unwrap();


@@ -19,21 +19,21 @@ pub mod options;
pub(crate) mod version; pub(crate) mod version;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::atomic::{AtomicBool, AtomicI64, Ordering};
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use common_telemetry::{error, info, warn}; use common_telemetry::info;
use common_wal::options::WalOptions; use common_wal::options::WalOptions;
use crossbeam_utils::atomic::AtomicCell;
use snafu::{ensure, OptionExt}; use snafu::{ensure, OptionExt};
use store_api::metadata::RegionMetadataRef; use store_api::metadata::RegionMetadataRef;
use store_api::storage::RegionId; use store_api::storage::RegionId;
use tokio::sync::RwLock as TokioRwLock;
use crate::access_layer::AccessLayerRef; use crate::access_layer::AccessLayerRef;
use crate::error::{RegionNotFoundSnafu, RegionStateSnafu, RegionTruncatedSnafu, Result}; use crate::error::{RegionNotFoundSnafu, RegionReadonlySnafu, Result};
use crate::manifest::action::{RegionMetaAction, RegionMetaActionList}; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::manifest::manager::RegionManifestManager; use crate::manifest::manager::RegionManifestManager;
use crate::memtable::MemtableBuilderRef; use crate::memtable::{MemtableBuilderRef, MemtableId};
use crate::region::version::{VersionControlRef, VersionRef}; use crate::region::version::{VersionControlRef, VersionRef};
use crate::request::OnFailure; use crate::request::OnFailure;
use crate::sst::file_purger::FilePurgerRef; use crate::sst::file_purger::FilePurgerRef;
@@ -57,23 +57,6 @@ impl RegionUsage {
} }
} }
/// State of the region.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegionState {
/// The region is opened but is still read-only.
ReadOnly,
/// The region is opened and is writable.
Writable,
/// The region is altering.
Altering,
/// The region is dropping.
Dropping,
/// The region is truncating.
Truncating,
/// The region is handling a region edit.
Editing,
}
/// Metadata and runtime status of a region. /// Metadata and runtime status of a region.
/// ///
/// Writing and reading a region follow a single-writer-multi-reader rule: /// Writing and reading a region follow a single-writer-multi-reader rule:
@@ -88,19 +71,19 @@ pub(crate) struct MitoRegion {
pub(crate) region_id: RegionId, pub(crate) region_id: RegionId,
/// Version controller for this region. /// Version controller for this region.
///
/// We MUST update the version control inside the write lock of the region manifest manager.
pub(crate) version_control: VersionControlRef, pub(crate) version_control: VersionControlRef,
/// SSTs accessor for this region. /// SSTs accessor for this region.
pub(crate) access_layer: AccessLayerRef, pub(crate) access_layer: AccessLayerRef,
/// Context to maintain manifest for this region. /// Manager to maintain manifest for this region.
pub(crate) manifest_ctx: ManifestContextRef, pub(crate) manifest_manager: TokioRwLock<RegionManifestManager>,
/// SST file purger. /// SST file purger.
pub(crate) file_purger: FilePurgerRef, pub(crate) file_purger: FilePurgerRef,
/// Wal options of this region. /// Wal options of this region.
pub(crate) wal_options: WalOptions, pub(crate) wal_options: WalOptions,
/// Last flush time in millis. /// Last flush time in millis.
last_flush_millis: AtomicI64, last_flush_millis: AtomicI64,
/// Whether the region is writable.
writable: AtomicBool,
/// Provider to get current time. /// Provider to get current time.
time_provider: TimeProviderRef, time_provider: TimeProviderRef,
/// Memtable builder for the region. /// Memtable builder for the region.
@@ -111,18 +94,15 @@ pub(crate) type MitoRegionRef = Arc<MitoRegion>;
impl MitoRegion { impl MitoRegion {
/// Stop background managers for this region. /// Stop background managers for this region.
pub(crate) async fn stop(&self) { pub(crate) async fn stop(&self) -> Result<()> {
self.manifest_ctx self.manifest_manager.write().await.stop().await?;
.manifest_manager
.write()
.await
.stop()
.await;
info!( info!(
"Stopped region manifest manager, region_id: {}", "Stopped region manifest manager, region_id: {}",
self.region_id self.region_id
); );
Ok(())
} }
/// Returns current metadata of the region. /// Returns current metadata of the region.
@@ -148,73 +128,19 @@ impl MitoRegion {
self.last_flush_millis.store(now, Ordering::Relaxed); self.last_flush_millis.store(now, Ordering::Relaxed);
} }
/// Returns whether the region is writable.
pub(crate) fn is_writable(&self) -> bool {
self.writable.load(Ordering::Relaxed)
}
/// Returns the region dir. /// Returns the region dir.
pub(crate) fn region_dir(&self) -> &str { pub(crate) fn region_dir(&self) -> &str {
self.access_layer.region_dir() self.access_layer.region_dir()
} }
/// Returns whether the region is writable. /// Sets the writable flag.
pub(crate) fn is_writable(&self) -> bool {
self.manifest_ctx.state.load() == RegionState::Writable
}
/// Returns the state of the region.
pub(crate) fn state(&self) -> RegionState {
self.manifest_ctx.state.load()
}
/// Sets the writable state.
pub(crate) fn set_writable(&self, writable: bool) { pub(crate) fn set_writable(&self, writable: bool) {
if writable { self.writable.store(writable, Ordering::Relaxed);
// Only sets the region to writable if it is read only.
// This prevents others updating the manifest.
let _ = self
.manifest_ctx
.state
.compare_exchange(RegionState::ReadOnly, RegionState::Writable);
} else {
self.manifest_ctx.state.store(RegionState::ReadOnly);
}
}
/// Sets the altering state.
/// You should call this method in the worker loop.
pub(crate) fn set_altering(&self) -> Result<()> {
self.compare_exchange_state(RegionState::Writable, RegionState::Altering)
}
/// Sets the dropping state.
/// You should call this method in the worker loop.
pub(crate) fn set_dropping(&self) -> Result<()> {
self.compare_exchange_state(RegionState::Writable, RegionState::Dropping)
}
/// Sets the truncating state.
/// You should call this method in the worker loop.
pub(crate) fn set_truncating(&self) -> Result<()> {
self.compare_exchange_state(RegionState::Writable, RegionState::Truncating)
}
/// Sets the editing state.
/// You should call this method in the worker loop.
pub(crate) fn set_editing(&self) -> Result<()> {
self.compare_exchange_state(RegionState::Writable, RegionState::Editing)
}
/// Sets the region to readonly gracefully. This acquires the manifest write lock.
pub(crate) async fn set_readonly_gracefully(&self) {
let _manager = self.manifest_ctx.manifest_manager.write().await;
// We acquires the write lock of the manifest manager to ensure that no one is updating the manifest.
// Then we change the state.
self.set_writable(false);
}
/// Switches the region state to `RegionState::Writable` if the current state is `expect`.
/// Otherwise, logs an error.
pub(crate) fn switch_state_to_writable(&self, expect: RegionState) {
if let Err(e) = self.compare_exchange_state(expect, RegionState::Writable) {
error!(e; "failed to switch region state to writable, expect state is {:?}", expect);
}
} }
/// Returns the region usage in bytes. /// Returns the region usage in bytes.
@@ -229,12 +155,7 @@ impl MitoRegion {
let wal_usage = self.estimated_wal_usage(memtable_usage); let wal_usage = self.estimated_wal_usage(memtable_usage);
let manifest_usage = self let manifest_usage = self.manifest_manager.read().await.manifest_usage();
.manifest_ctx
.manifest_manager
.read()
.await
.manifest_usage();
RegionUsage { RegionUsage {
region_id, region_id,
@@ -250,133 +171,28 @@ impl MitoRegion {
((memtable_usage as f32) * ESTIMATED_WAL_FACTOR) as u64 ((memtable_usage as f32) * ESTIMATED_WAL_FACTOR) as u64
} }
/// Sets the state of the region to given state if the current state equals to pub(crate) async fn apply_edit(
/// the expected.
fn compare_exchange_state(&self, expect: RegionState, state: RegionState) -> Result<()> {
self.manifest_ctx
.state
.compare_exchange(expect, state)
.map_err(|actual| {
RegionStateSnafu {
region_id: self.region_id,
state: actual,
expect,
}
.build()
})?;
Ok(())
}
}
/// Context to update the region manifest.
#[derive(Debug)]
pub(crate) struct ManifestContext {
/// Manager to maintain manifest for this region.
manifest_manager: tokio::sync::RwLock<RegionManifestManager>,
/// The state of the region. The region checks the state before updating
/// manifest.
state: AtomicCell<RegionState>,
}
impl ManifestContext {
pub(crate) fn new(manager: RegionManifestManager, state: RegionState) -> Self {
ManifestContext {
manifest_manager: tokio::sync::RwLock::new(manager),
state: AtomicCell::new(state),
}
}
pub(crate) async fn has_update(&self) -> Result<bool> {
self.manifest_manager.read().await.has_update().await
}
/// Updates the manifest if current state is `expect_state` and executes
/// the `applier` if the manifest is updated.
pub(crate) async fn update_manifest(
&self, &self,
expect_state: RegionState, edit: RegionEdit,
action_list: RegionMetaActionList, memtables_to_remove: &[MemtableId],
applier: impl FnOnce(),
) -> Result<()> { ) -> Result<()> {
// Acquires the write lock of the manifest manager. info!("Applying {edit:?} to region {}", self.region_id);
let mut manager = self.manifest_manager.write().await;
// Gets current manifest.
let manifest = manager.manifest();
// Checks state inside the lock. This is to ensure that we won't update the manifest
// after `set_readonly_gracefully()` is called.
let current_state = self.state.load();
ensure!(
current_state == expect_state,
RegionStateSnafu {
region_id: manifest.metadata.region_id,
state: current_state,
expect: expect_state,
}
);
for action in &action_list.actions { self.manifest_manager
// Checks whether the edit is still applicable. .write()
let RegionMetaAction::Edit(edit) = &action else { .await
continue; .update(RegionMetaActionList::with_action(RegionMetaAction::Edit(
}; edit.clone(),
)))
// Checks whether the region is truncated. .await?;
let Some(truncated_entry_id) = manifest.truncated_entry_id else {
continue;
};
// This is an edit from flush.
if let Some(flushed_entry_id) = edit.flushed_entry_id {
ensure!(
truncated_entry_id < flushed_entry_id,
RegionTruncatedSnafu {
region_id: manifest.metadata.region_id,
}
);
}
// This is an edit from compaction.
if !edit.files_to_remove.is_empty() {
// Input files of the compaction task has been truncated.
for file in &edit.files_to_remove {
ensure!(
manifest.files.contains_key(&file.file_id),
RegionTruncatedSnafu {
region_id: manifest.metadata.region_id,
}
);
}
}
}
// Now we can update the manifest.
manager.update(action_list).await.inspect_err(
|e| error!(e; "Failed to update manifest, region_id: {}", manifest.metadata.region_id),
)?;
// Executes the applier. We MUST hold the write lock.
applier();
if self.state.load() == RegionState::ReadOnly {
warn!(
"Region {} becomes read-only while updating manifest which may cause inconsistency",
manifest.metadata.region_id
);
}
// Apply edit to region's version.
self.version_control
.apply_edit(edit, memtables_to_remove, self.file_purger.clone());
Ok(()) Ok(())
} }
} }
#[cfg(test)]
impl ManifestContext {
pub(crate) async fn manifest(&self) -> Arc<crate::manifest::action::RegionManifest> {
self.manifest_manager.read().await.manifest()
}
}
pub(crate) type ManifestContextRef = Arc<ManifestContext>;
/// Regions indexed by ids. /// Regions indexed by ids.
#[derive(Debug, Default)] #[derive(Debug, Default)]
pub(crate) struct RegionMap { pub(crate) struct RegionMap {
@@ -409,14 +225,7 @@ impl RegionMap {
let region = self let region = self
.get_region(region_id) .get_region(region_id)
.context(RegionNotFoundSnafu { region_id })?; .context(RegionNotFoundSnafu { region_id })?;
ensure!( ensure!(region.is_writable(), RegionReadonlySnafu { region_id });
region.is_writable(),
RegionStateSnafu {
region_id,
state: region.state(),
expect: RegionState::Writable,
}
);
Ok(region) Ok(region)
} }
@@ -456,15 +265,3 @@ impl RegionMap {
} }
pub(crate) type RegionMapRef = Arc<RegionMap>; pub(crate) type RegionMapRef = Arc<RegionMap>;
#[cfg(test)]
mod tests {
use crossbeam_utils::atomic::AtomicCell;
use crate::region::RegionState;
#[test]
fn test_region_state_lock_free() {
assert!(AtomicCell::<RegionState>::is_lock_free());
}
}
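
One side of the region.rs hunk above guards region state with crossbeam's AtomicCell<RegionState>, the other with a plain AtomicBool writable flag. As a standalone illustration of the compare-exchange guard, here is a reduced sketch (simplified types, not the project's exact code):

use crossbeam_utils::atomic::AtomicCell;

/// Reduced copy of the state enum from the hunk above.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RegionState {
    ReadOnly,
    Writable,
    Dropping,
}

/// Sketch of the lock-free guard: state only moves from `expect` to `next`;
/// any other observed state is handed back as the error value.
struct StateGuard {
    state: AtomicCell<RegionState>,
}

impl StateGuard {
    fn new() -> Self {
        Self {
            state: AtomicCell::new(RegionState::ReadOnly),
        }
    }

    fn compare_exchange_state(
        &self,
        expect: RegionState,
        next: RegionState,
    ) -> Result<(), RegionState> {
        // `AtomicCell::compare_exchange` returns Err(actual) when the current
        // value is not `expect`, which is what the region-state error reports.
        self.state.compare_exchange(expect, next).map(|_| ())
    }

    fn is_writable(&self) -> bool {
        self.state.load() == RegionState::Writable
    }
}

A test like the removed test_region_state_lock_free then only needs to assert AtomicCell::<RegionState>::is_lock_free() so the guard never degrades to a spinlock.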


@@ -15,7 +15,7 @@
 //! Region opener.

 use std::collections::HashMap;
-use std::sync::atomic::AtomicI64;
+use std::sync::atomic::{AtomicBool, AtomicI64};
 use std::sync::Arc;

 use common_telemetry::{debug, error, info, warn};
@@ -27,6 +27,7 @@ use snafu::{ensure, OptionExt};
 use store_api::logstore::LogStore;
 use store_api::metadata::{ColumnMetadata, RegionMetadata};
 use store_api::storage::{ColumnId, RegionId};
+use tokio::sync::RwLock;

 use crate::access_layer::AccessLayer;
 use crate::cache::CacheManagerRef;
@@ -40,7 +41,7 @@ use crate::memtable::time_partition::TimePartitions;
 use crate::memtable::MemtableBuilderProvider;
 use crate::region::options::RegionOptions;
 use crate::region::version::{VersionBuilder, VersionControl, VersionControlRef};
-use crate::region::{ManifestContext, MitoRegion, RegionState};
+use crate::region::MitoRegion;
 use crate::region_write_ctx::RegionWriteCtx;
 use crate::request::OptionOutputTx;
 use crate::schedule::scheduler::SchedulerRef;
@@ -202,11 +203,7 @@ impl RegionOpener {
             region_id,
             version_control,
             access_layer: access_layer.clone(),
-            // Region is writable after it is created.
-            manifest_ctx: Arc::new(ManifestContext::new(
-                manifest_manager,
-                RegionState::Writable,
-            )),
+            manifest_manager: RwLock::new(manifest_manager),
             file_purger: Arc::new(LocalFilePurger::new(
                 self.purge_scheduler,
                 access_layer,
@@ -214,6 +211,8 @@ impl RegionOpener {
             )),
             wal_options,
             last_flush_millis: AtomicI64::new(time_provider.current_time_millis()),
+            // Region is writable after it is created.
+            writable: AtomicBool::new(true),
             time_provider,
             memtable_builder,
         })
@@ -332,14 +331,12 @@ impl RegionOpener {
             region_id: self.region_id,
             version_control,
             access_layer,
-            // Region is always opened in read only mode.
-            manifest_ctx: Arc::new(ManifestContext::new(
-                manifest_manager,
-                RegionState::ReadOnly,
-            )),
+            manifest_manager: RwLock::new(manifest_manager),
             file_purger,
             wal_options,
             last_flush_millis: AtomicI64::new(time_provider.current_time_millis()),
+            // Region is always opened in read only mode.
+            writable: AtomicBool::new(false),
             time_provider,
             memtable_builder,
         };
@@ -441,11 +438,13 @@ pub(crate) async fn replay_memtable<S: LogStore>(
     // data in the WAL.
     let mut last_entry_id = flushed_entry_id;
     let replay_from_entry_id = flushed_entry_id + 1;
+    let mut stale_entry_found = false;
     let mut wal_stream = wal.scan(region_id, replay_from_entry_id, wal_options)?;
     while let Some(res) = wal_stream.next().await {
         let (entry_id, entry) = res?;
         if entry_id <= flushed_entry_id {
+            stale_entry_found = true;
             warn!("Stale WAL entries read during replay, region id: {}, flushed entry id: {}, entry id read: {}", region_id, flushed_entry_id, entry_id);
             ensure!(
                 allow_stale_entries,
@@ -474,8 +473,11 @@ pub(crate) async fn replay_memtable<S: LogStore>(
         region_write_ctx.write_memtable();
     }

+    if allow_stale_entries && stale_entry_found {
         wal.obsolete(region_id, flushed_entry_id, wal_options)
             .await?;
+        info!("Force obsolete WAL entries, region id: {}, flushed entry id: {}, last entry id read: {}", region_id, flushed_entry_id, last_entry_id);
+    }

     info!(
         "Replay WAL for region: {}, rows recovered: {}, last entry id: {}",


@@ -16,17 +16,18 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::Instant; use std::time::{Duration, Instant};
use api::helper::{ use api::helper::{
is_column_type_value_eq, is_semantic_type_eq, proto_value_type, to_proto_value, is_column_type_value_eq, is_semantic_type_eq, proto_value_type, to_proto_value,
ColumnDataTypeWrapper, ColumnDataTypeWrapper,
}; };
use api::v1::{ColumnDataType, ColumnSchema, OpType, Rows, SemanticType, Value}; use api::v1::{ColumnDataType, ColumnSchema, OpType, Rows, SemanticType, Value};
use common_telemetry::info; use common_telemetry::{info, warn};
use datatypes::prelude::DataType; use datatypes::prelude::DataType;
use prometheus::HistogramTimer; use prometheus::HistogramTimer;
use prost::Message; use prost::Message;
use smallvec::SmallVec;
use snafu::{ensure, OptionExt, ResultExt}; use snafu::{ensure, OptionExt, ResultExt};
use store_api::metadata::{ColumnMetadata, RegionMetadata}; use store_api::metadata::{ColumnMetadata, RegionMetadata};
use store_api::region_engine::SetReadonlyResponse; use store_api::region_engine::SetReadonlyResponse;
@@ -43,7 +44,10 @@ use crate::error::{
FlushRegionSnafu, InvalidRequestSnafu, Result, FlushRegionSnafu, InvalidRequestSnafu, Result,
}; };
use crate::manifest::action::RegionEdit; use crate::manifest::action::RegionEdit;
use crate::memtable::MemtableId;
use crate::metrics::COMPACTION_ELAPSED_TOTAL; use crate::metrics::COMPACTION_ELAPSED_TOTAL;
use crate::sst::file::FileMeta;
use crate::sst::file_purger::{FilePurgerRef, PurgeRequest};
use crate::wal::EntryId; use crate::wal::EntryId;
/// Request to write a region. /// Request to write a region.
@@ -616,8 +620,6 @@ pub(crate) enum BackgroundNotify {
CompactionFinished(CompactionFinished), CompactionFinished(CompactionFinished),
/// Compaction has failed. /// Compaction has failed.
CompactionFailed(CompactionFailed), CompactionFailed(CompactionFailed),
/// Truncate result.
Truncate(TruncateResult),
} }
/// Notifies a flush job is finished. /// Notifies a flush job is finished.
@@ -625,10 +627,18 @@ pub(crate) enum BackgroundNotify {
pub(crate) struct FlushFinished { pub(crate) struct FlushFinished {
/// Region id. /// Region id.
pub(crate) region_id: RegionId, pub(crate) region_id: RegionId,
/// Meta of the flushed SSTs.
pub(crate) file_metas: Vec<FileMeta>,
/// Entry id of flushed data. /// Entry id of flushed data.
pub(crate) flushed_entry_id: EntryId, pub(crate) flushed_entry_id: EntryId,
/// Sequence of flushed data.
pub(crate) flushed_sequence: SequenceNumber,
/// Id of memtables to remove.
pub(crate) memtables_to_remove: SmallVec<[MemtableId; 2]>,
/// Flush result senders. /// Flush result senders.
pub(crate) senders: Vec<OutputTx>, pub(crate) senders: Vec<OutputTx>,
/// File purger for cleaning files on failure.
pub(crate) file_purger: FilePurgerRef,
/// Flush timer. /// Flush timer.
pub(crate) _timer: HistogramTimer, pub(crate) _timer: HistogramTimer,
} }
@@ -650,6 +660,12 @@ impl OnFailure for FlushFinished {
region_id: self.region_id, region_id: self.region_id,
})); }));
} }
// Clean flushed files.
for file in &self.file_metas {
self.file_purger.send_request(PurgeRequest {
file_meta: file.clone(),
});
}
} }
} }
@@ -665,8 +681,16 @@ pub(crate) struct FlushFailed {
pub(crate) struct CompactionFinished { pub(crate) struct CompactionFinished {
/// Region id. /// Region id.
pub(crate) region_id: RegionId, pub(crate) region_id: RegionId,
/// Compaction output files that are to be added to region version.
pub(crate) compaction_outputs: Vec<FileMeta>,
/// Compacted files that are to be removed from region version.
pub(crate) compacted_files: Vec<FileMeta>,
/// Compaction result senders. /// Compaction result senders.
pub(crate) senders: Vec<OutputTx>, pub(crate) senders: Vec<OutputTx>,
/// File purger for cleaning files on failure.
pub(crate) file_purger: FilePurgerRef,
/// Inferred Compaction time window.
pub(crate) compaction_time_window: Option<Duration>,
/// Start time of compaction task. /// Start time of compaction task.
pub(crate) start_time: Instant, pub(crate) start_time: Instant,
} }
@@ -684,7 +708,8 @@ impl CompactionFinished {
} }
impl OnFailure for CompactionFinished { impl OnFailure for CompactionFinished {
/// Compaction succeeded but failed to update manifest or region's already been dropped. /// Compaction succeeded but failed to update manifest or region's already been dropped,
/// clean compaction output files.
fn on_failure(&mut self, err: Error) { fn on_failure(&mut self, err: Error) {
let err = Arc::new(err); let err = Arc::new(err);
for sender in self.senders.drain(..) { for sender in self.senders.drain(..) {
@@ -692,6 +717,15 @@ impl OnFailure for CompactionFinished {
region_id: self.region_id, region_id: self.region_id,
})); }));
} }
for file in &self.compaction_outputs {
warn!(
"Cleaning region {} compaction output file: {}",
self.region_id, file.file_id
);
self.file_purger.send_request(PurgeRequest {
file_meta: file.clone(),
});
}
} }
} }
@@ -703,21 +737,6 @@ pub(crate) struct CompactionFailed {
pub(crate) err: Arc<Error>, pub(crate) err: Arc<Error>,
} }
/// Notifies the truncate result of a region.
#[derive(Debug)]
pub(crate) struct TruncateResult {
/// Region id.
pub(crate) region_id: RegionId,
/// Result sender.
pub(crate) sender: OptionOutputTx,
/// Truncate result.
pub(crate) result: Result<()>,
/// Truncated entry id.
pub(crate) truncated_entry_id: EntryId,
/// Truncated sequence.
pub(crate) truncated_sequence: SequenceNumber,
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use api::v1::value::ValueData; use api::v1::value::ValueData;


@@ -396,7 +396,6 @@ pub struct CreateRequestBuilder {
     primary_key: Option<Vec<ColumnId>>,
     all_not_null: bool,
     engine: String,
-    ts_type: ConcreteDataType,
 }

 impl Default for CreateRequestBuilder {
@@ -409,7 +408,6 @@ impl Default for CreateRequestBuilder {
             primary_key: None,
             all_not_null: false,
             engine: MITO_ENGINE_NAME.to_string(),
-            ts_type: ConcreteDataType::timestamp_millisecond_datatype(),
         }
     }
 }
@@ -456,12 +454,6 @@ impl CreateRequestBuilder {
         self
     }

-    #[must_use]
-    pub fn with_ts_type(mut self, ty: ConcreteDataType) -> Self {
-        self.ts_type = ty;
-        self
-    }
-
     pub fn build(&self) -> RegionCreateRequest {
         let mut column_id = 0;
         let mut column_metadatas = Vec::with_capacity(self.tag_num + self.field_num + 1);
@@ -495,7 +487,7 @@ impl CreateRequestBuilder {
         column_metadatas.push(ColumnMetadata {
             column_schema: ColumnSchema::new(
                 "ts",
-                self.ts_type.clone(),
+                ConcreteDataType::timestamp_millisecond_datatype(),
                 // Time index is always not null.
                 false,
             ),


@@ -16,25 +16,19 @@
 use std::sync::Arc;

-use common_datasource::compression::CompressionType;
 use common_test_util::temp_dir::{create_temp_dir, TempDir};
 use object_store::services::Fs;
 use object_store::util::join_dir;
 use object_store::ObjectStore;
-use store_api::metadata::RegionMetadataRef;
 use tokio::sync::mpsc::Sender;

 use crate::access_layer::{AccessLayer, AccessLayerRef};
 use crate::cache::CacheManager;
 use crate::compaction::CompactionScheduler;
-use crate::config::MitoConfig;
 use crate::flush::FlushScheduler;
-use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
-use crate::region::{ManifestContext, ManifestContextRef, RegionState};
 use crate::request::WorkerRequest;
 use crate::schedule::scheduler::{LocalScheduler, SchedulerRef};
 use crate::sst::index::intermediate::IntermediateManager;
-use crate::worker::WorkerListener;

 /// Scheduler mocker.
 pub(crate) struct SchedulerEnv {
@@ -79,13 +73,7 @@ impl SchedulerEnv {
     ) -> CompactionScheduler {
         let scheduler = self.get_scheduler();

-        CompactionScheduler::new(
-            scheduler,
-            request_sender,
-            Arc::new(CacheManager::default()),
-            Arc::new(MitoConfig::default()),
-            WorkerListener::default(),
-        )
+        CompactionScheduler::new(scheduler, request_sender, Arc::new(CacheManager::default()))
     }

     /// Creates a new flush scheduler.
@@ -95,27 +83,6 @@ impl SchedulerEnv {
         FlushScheduler::new(scheduler)
     }

-    /// Creates a new manifest context.
-    pub(crate) async fn mock_manifest_context(
-        &self,
-        metadata: RegionMetadataRef,
-    ) -> ManifestContextRef {
-        Arc::new(ManifestContext::new(
-            RegionManifestManager::new(
-                metadata,
-                RegionManifestOptions {
-                    manifest_dir: "".to_string(),
-                    object_store: self.access_layer.object_store().clone(),
-                    compress_type: CompressionType::Uncompressed,
-                    checkpoint_distance: 10,
-                },
-            )
-            .await
-            .unwrap(),
-            RegionState::Writable,
-        ))
-    }
-
     fn get_scheduler(&self) -> SchedulerRef {
         self.scheduler
             .clone()


@@ -21,7 +21,6 @@ mod handle_compaction;
mod handle_create; mod handle_create;
mod handle_drop; mod handle_drop;
mod handle_flush; mod handle_flush;
mod handle_manifest;
mod handle_open; mod handle_open;
mod handle_truncate; mod handle_truncate;
mod handle_write; mod handle_write;
@@ -46,8 +45,9 @@ use crate::cache::write_cache::{WriteCache, WriteCacheRef};
use crate::cache::{CacheManager, CacheManagerRef}; use crate::cache::{CacheManager, CacheManagerRef};
use crate::compaction::CompactionScheduler; use crate::compaction::CompactionScheduler;
use crate::config::MitoConfig; use crate::config::MitoConfig;
use crate::error::{JoinSnafu, Result, WorkerStoppedSnafu}; use crate::error::{InvalidRequestSnafu, JoinSnafu, Result, WorkerStoppedSnafu};
use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef}; use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef};
use crate::manifest::action::RegionEdit;
use crate::memtable::MemtableBuilderProvider; use crate::memtable::MemtableBuilderProvider;
use crate::region::{MitoRegionRef, RegionMap, RegionMapRef}; use crate::region::{MitoRegionRef, RegionMap, RegionMapRef};
use crate::request::{ use crate::request::{
@@ -367,7 +367,7 @@ impl<S: LogStore> WorkerStarter<S> {
running: running.clone(), running: running.clone(),
memtable_builder_provider: MemtableBuilderProvider::new( memtable_builder_provider: MemtableBuilderProvider::new(
Some(self.write_buffer_manager.clone()), Some(self.write_buffer_manager.clone()),
self.config.clone(), self.config,
), ),
purge_scheduler: self.purge_scheduler.clone(), purge_scheduler: self.purge_scheduler.clone(),
write_buffer_manager: self.write_buffer_manager, write_buffer_manager: self.write_buffer_manager,
@@ -376,8 +376,6 @@ impl<S: LogStore> WorkerStarter<S> {
self.scheduler, self.scheduler,
sender.clone(), sender.clone(),
self.cache_manager.clone(), self.cache_manager.clone(),
self.config,
self.listener.clone(),
), ),
stalled_requests: StalledRequests::default(), stalled_requests: StalledRequests::default(),
listener: self.listener, listener: self.listener,
@@ -624,7 +622,10 @@ impl<S: LogStore> RegionWorkerLoop<S> {
edit, edit,
tx, tx,
} => { } => {
self.handle_region_edit(region_id, edit, tx).await; let result = self.edit_region(region_id, edit).await;
if let Err(Err(e)) = tx.send(result) {
warn!("Failed to send edit region error to caller, error: {e:?}");
}
} }
// We receive a stop signal, but we still want to process remaining // We receive a stop signal, but we still want to process remaining
// requests. The worker thread will then check the running flag and // requests. The worker thread will then check the running flag and
@@ -668,11 +669,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
self.handle_compaction_request(ddl.region_id, ddl.sender); self.handle_compaction_request(ddl.region_id, ddl.sender);
continue; continue;
} }
DdlRequest::Truncate(_) => { DdlRequest::Truncate(_) => self.handle_truncate_request(ddl.region_id).await,
self.handle_truncate_request(ddl.region_id, ddl.sender)
.await;
continue;
}
DdlRequest::Catchup(req) => self.handle_catchup_request(ddl.region_id, req).await, DdlRequest::Catchup(req) => self.handle_catchup_request(ddl.region_id, req).await,
}; };
@@ -709,7 +706,6 @@ impl<S: LogStore> RegionWorkerLoop<S> {
self.handle_compaction_finished(region_id, req).await self.handle_compaction_finished(region_id, req).await
} }
BackgroundNotify::CompactionFailed(req) => self.handle_compaction_failure(req).await, BackgroundNotify::CompactionFailed(req) => self.handle_compaction_failure(req).await,
BackgroundNotify::Truncate(req) => self.handle_truncate_result(req).await,
} }
} }
@@ -720,17 +716,35 @@ impl<S: LogStore> RegionWorkerLoop<S> {
sender: oneshot::Sender<SetReadonlyResponse>, sender: oneshot::Sender<SetReadonlyResponse>,
) { ) {
if let Some(region) = self.regions.get_region(region_id) { if let Some(region) = self.regions.get_region(region_id) {
// We need to do this in background as we need the manifest lock. region.set_writable(false);
common_runtime::spawn_bg(async move {
region.set_readonly_gracefully().await;
let last_entry_id = region.version_control.current().last_entry_id; let last_entry_id = region.version_control.current().last_entry_id;
let _ = sender.send(SetReadonlyResponse::success(Some(last_entry_id))); let _ = sender.send(SetReadonlyResponse::success(Some(last_entry_id)));
});
} else { } else {
let _ = sender.send(SetReadonlyResponse::NotFound); let _ = sender.send(SetReadonlyResponse::NotFound);
} }
} }
async fn edit_region(&self, region_id: RegionId, edit: RegionEdit) -> Result<()> {
let region = self.regions.writable_region(region_id)?;
for file_meta in &edit.files_to_add {
let is_exist = region.access_layer.is_exist(file_meta).await?;
ensure!(
is_exist,
InvalidRequestSnafu {
region_id,
reason: format!(
"trying to add a not exist file '{}' when editing region",
file_meta.file_id
)
}
);
}
// Applying region edit directly has nothing to do with memtables (at least for now).
region.apply_edit(edit, &[]).await
}
} }
impl<S> RegionWorkerLoop<S> { impl<S> RegionWorkerLoop<S> {
@@ -739,7 +753,9 @@ impl<S> RegionWorkerLoop<S> {
// Closes remaining regions. // Closes remaining regions.
let regions = self.regions.list_regions(); let regions = self.regions.list_regions();
for region in regions { for region in regions {
region.stop().await; if let Err(e) = region.stop().await {
error!(e; "Failed to stop region {}", region.region_id);
}
} }
self.regions.clear(); self.regions.clear();
@@ -809,10 +825,10 @@ impl WorkerListener {
let _ = removed; let _ = removed;
} }
pub(crate) async fn on_merge_ssts_finished(&self, region_id: RegionId) { pub(crate) async fn on_handle_compaction_finished(&self, region_id: RegionId) {
#[cfg(any(test, feature = "test"))] #[cfg(any(test, feature = "test"))]
if let Some(listener) = &self.listener { if let Some(listener) = &self.listener {
listener.on_merge_ssts_finished(region_id).await; listener.on_handle_compaction_finished(region_id).await;
} }
// Avoid compiler warning. // Avoid compiler warning.
let _ = region_id; let _ = region_id;
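The `edit_region` path above is a validate-before-apply step: every SST file the edit wants to add must already exist in object storage, otherwise the request is rejected before anything is written. Below is a minimal, self-contained sketch of that pattern; `AccessLayer`, `FileMeta`, and `RegionEdit` here are simplified stand-ins for illustration, not the actual mito2 types.

```rust
// Sketch of the validate-before-apply check, with stand-in types.
use std::collections::HashSet;

struct FileMeta {
    file_id: String,
}

struct RegionEdit {
    files_to_add: Vec<FileMeta>,
}

trait AccessLayer {
    /// Reports whether the SST file described by `meta` exists in storage.
    fn is_exist(&self, meta: &FileMeta) -> Result<bool, String>;
}

/// Rejects the edit if any file it wants to add is missing from storage.
fn validate_edit(layer: &dyn AccessLayer, edit: &RegionEdit) -> Result<(), String> {
    for meta in &edit.files_to_add {
        if !layer.is_exist(meta)? {
            return Err(format!(
                "trying to add a non-existent file '{}' when editing region",
                meta.file_id
            ));
        }
    }
    Ok(())
}

/// Toy access layer backed by an in-memory set of file ids.
struct InMemoryLayer(HashSet<String>);

impl AccessLayer for InMemoryLayer {
    fn is_exist(&self, meta: &FileMeta) -> Result<bool, String> {
        Ok(self.0.contains(&meta.file_id))
    }
}

fn main() {
    let layer = InMemoryLayer(["a.parquet".to_string()].into_iter().collect());
    let ok = RegionEdit {
        files_to_add: vec![FileMeta { file_id: "a.parquet".into() }],
    };
    let missing = RegionEdit {
        files_to_add: vec![FileMeta { file_id: "b.parquet".into() }],
    };
    assert!(validate_edit(&layer, &ok).is_ok());
    assert!(validate_edit(&layer, &missing).is_err());
}
```

Usage mirrors the handler: the validation runs before `apply_edit`, so a bad request never reaches the manifest.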

View File

@@ -16,7 +16,7 @@
use std::sync::Arc; use std::sync::Arc;
use common_telemetry::{debug, info}; use common_telemetry::{debug, error, info};
use snafu::ResultExt; use snafu::ResultExt;
use store_api::metadata::{RegionMetadata, RegionMetadataBuilder, RegionMetadataRef}; use store_api::metadata::{RegionMetadata, RegionMetadataBuilder, RegionMetadataRef};
use store_api::region_request::RegionAlterRequest; use store_api::region_request::RegionAlterRequest;
@@ -26,7 +26,9 @@ use crate::error::{
InvalidMetadataSnafu, InvalidRegionRequestSchemaVersionSnafu, InvalidRegionRequestSnafu, Result, InvalidMetadataSnafu, InvalidRegionRequestSchemaVersionSnafu, InvalidRegionRequestSnafu, Result,
}; };
use crate::flush::FlushReason; use crate::flush::FlushReason;
use crate::manifest::action::RegionChange; use crate::manifest::action::{RegionChange, RegionMetaAction, RegionMetaActionList};
use crate::region::version::Version;
use crate::region::MitoRegionRef;
use crate::request::{DdlRequest, OptionOutputTx, SenderDdlRequest}; use crate::request::{DdlRequest, OptionOutputTx, SenderDdlRequest};
use crate::worker::RegionWorkerLoop; use crate::worker::RegionWorkerLoop;
@@ -105,26 +107,49 @@ impl<S> RegionWorkerLoop<S> {
return; return;
} }
// Now we can alter the region directly.
if let Err(e) = alter_region_schema(&region, &version, request).await {
error!(e; "Failed to alter region schema, region_id: {}", region_id);
sender.send(Err(e));
return;
}
info!( info!(
"Try to alter region {} from version {} to {}", "Schema of region {} is altered from {} to {}",
region_id, region_id,
version.metadata.schema_version, version.metadata.schema_version,
region.metadata().schema_version region.metadata().schema_version
); );
let new_meta = match metadata_after_alteration(&version.metadata, request) { // Notifies waiters.
Ok(new_meta) => new_meta, sender.send(Ok(0));
Err(e) => {
sender.send(Err(e));
return;
} }
}; }
/// Alter the schema of the region.
async fn alter_region_schema(
region: &MitoRegionRef,
version: &Version,
request: RegionAlterRequest,
) -> Result<()> {
let new_meta = metadata_after_alteration(&version.metadata, request)?;
// Persist the metadata to region's manifest. // Persist the metadata to region's manifest.
let change = RegionChange { let change = RegionChange {
metadata: new_meta.clone(), metadata: new_meta.clone(),
}; };
self.handle_manifest_region_change(region, change, sender) let action_list = RegionMetaActionList::with_action(RegionMetaAction::Change(change));
} region
.manifest_manager
.write()
.await
.update(action_list)
.await?;
// Apply the metadata to region's version.
region
.version_control
.alter_schema(new_meta, &region.memtable_builder);
Ok(())
} }
/// Creates a metadata after applying the alter `request` to the old `metadata`. /// Creates a metadata after applying the alter `request` to the old `metadata`.
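`alter_region_schema` above follows a persist-then-apply ordering: the `RegionChange` is written to the manifest first, and the in-memory version is updated only after that write succeeds. Here is a minimal sketch of that ordering, assuming simplified `Manifest` and `VersionControl` stand-ins rather than the real mito2 types.

```rust
// Sketch of persist-then-apply: durable write first, in-memory update second.
struct Manifest {
    entries: Vec<String>,
}

impl Manifest {
    fn update(&mut self, action: String) -> Result<(), String> {
        // A real implementation writes to durable storage; a failure here must
        // leave the in-memory state untouched.
        self.entries.push(action);
        Ok(())
    }
}

struct VersionControl {
    schema_version: u64,
}

fn alter_schema(
    manifest: &mut Manifest,
    version: &mut VersionControl,
    new_schema_version: u64,
) -> Result<(), String> {
    // 1. Persist the change.
    manifest.update(format!("change schema to v{new_schema_version}"))?;
    // 2. Apply it in memory only after persistence succeeded.
    version.schema_version = new_schema_version;
    Ok(())
}

fn main() {
    let mut manifest = Manifest { entries: vec![] };
    let mut version = VersionControl { schema_version: 1 };
    alter_schema(&mut manifest, &mut version, 2).unwrap();
    assert_eq!(version.schema_version, 2);
    assert_eq!(manifest.entries.len(), 1);
}
```

The same ordering appears in the compaction-finished and flush-finished handlers in this diff: the manifest write happens before the edit is applied to the region's version.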

View File

@@ -45,7 +45,8 @@ impl<S: LogStore> RegionWorkerLoop<S> {
let is_mutable_empty = region.version().memtables.mutable.is_empty(); let is_mutable_empty = region.version().memtables.mutable.is_empty();
// Utilizes the short circuit evaluation. // Utilizes the short circuit evaluation.
let region = if !is_mutable_empty || region.manifest_ctx.has_update().await? { let region =
if !is_mutable_empty || region.manifest_manager.read().await.has_update().await? {
info!("Reopening the region: {region_id}, empty mutable: {is_mutable_empty}"); info!("Reopening the region: {region_id}, empty mutable: {is_mutable_empty}");
let reopened_region = Arc::new( let reopened_region = Arc::new(
RegionOpener::new( RegionOpener::new(

View File

@@ -33,7 +33,7 @@ impl<S> RegionWorkerLoop<S> {
info!("Try to close region {}", region_id); info!("Try to close region {}", region_id);
region.stop().await; region.stop().await?;
self.regions.remove_region(region_id); self.regions.remove_region(region_id);
// Clean flush status. // Clean flush status.
self.flush_scheduler.on_region_closed(region_id); self.flush_scheduler.on_region_closed(region_id);

View File

@@ -16,8 +16,9 @@ use common_telemetry::{error, info, warn};
use store_api::logstore::LogStore; use store_api::logstore::LogStore;
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::metrics::COMPACTION_REQUEST_COUNT; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::request::{CompactionFailed, CompactionFinished, OptionOutputTx}; use crate::metrics::{COMPACTION_REQUEST_COUNT, COMPACTION_STAGE_ELAPSED};
use crate::request::{CompactionFailed, CompactionFinished, OnFailure, OptionOutputTx};
use crate::worker::RegionWorkerLoop; use crate::worker::RegionWorkerLoop;
impl<S: LogStore> RegionWorkerLoop<S> { impl<S: LogStore> RegionWorkerLoop<S> {
@@ -37,7 +38,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
&region.access_layer, &region.access_layer,
&region.file_purger, &region.file_purger,
sender, sender,
&region.manifest_ctx, self.config.clone(),
) { ) {
error!(e; "Failed to schedule compaction task for region: {}", region_id); error!(e; "Failed to schedule compaction task for region: {}", region_id);
} else { } else {
@@ -54,6 +55,8 @@ impl<S: LogStore> RegionWorkerLoop<S> {
region_id: RegionId, region_id: RegionId,
mut request: CompactionFinished, mut request: CompactionFinished,
) { ) {
self.listener.on_handle_compaction_finished(region_id).await;
let Some(region) = self.regions.writable_region_or(region_id, &mut request) else { let Some(region) = self.regions.writable_region_or(region_id, &mut request) else {
warn!( warn!(
"Unable to finish the compaction task for a read only region {}", "Unable to finish the compaction task for a read only region {}",
@@ -62,12 +65,44 @@ impl<S: LogStore> RegionWorkerLoop<S> {
return; return;
}; };
{
let manifest_timer = COMPACTION_STAGE_ELAPSED
.with_label_values(&["write_manifest"])
.start_timer();
// Write region edit to manifest.
let edit = RegionEdit {
files_to_add: std::mem::take(&mut request.compaction_outputs),
files_to_remove: std::mem::take(&mut request.compacted_files),
compaction_time_window: request.compaction_time_window,
flushed_entry_id: None,
flushed_sequence: None,
};
let action_list =
RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone()));
if let Err(e) = region
.manifest_manager
.write()
.await
.update(action_list)
.await
{
error!(e; "Failed to update manifest, region: {}", region_id);
manifest_timer.stop_and_discard();
request.on_failure(e);
return;
}
// Apply edit to region's version.
region
.version_control
.apply_edit(edit, &[], region.file_purger.clone());
}
// compaction finished. // compaction finished.
request.on_success(); request.on_success();
// Schedule next compaction if necessary. // Schedule next compaction if necessary.
self.compaction_scheduler self.compaction_scheduler
.on_compaction_finished(region_id, &region.manifest_ctx); .on_compaction_finished(region_id, self.config.clone());
} }
/// When compaction fails, we simply log the error. /// When compaction fails, we simply log the error.

View File

@@ -16,7 +16,7 @@
use std::time::Duration; use std::time::Duration;
use common_telemetry::{error, info, warn}; use common_telemetry::{info, warn};
use futures::TryStreamExt; use futures::TryStreamExt;
use object_store::util::join_path; use object_store::util::join_path;
use object_store::{EntryMode, ObjectStore}; use object_store::{EntryMode, ObjectStore};
@@ -27,7 +27,7 @@ use tokio::time::sleep;
use crate::error::{OpenDalSnafu, Result}; use crate::error::{OpenDalSnafu, Result};
use crate::metrics::REGION_COUNT; use crate::metrics::REGION_COUNT;
use crate::region::{RegionMapRef, RegionState}; use crate::region::RegionMapRef;
use crate::worker::{RegionWorkerLoop, DROPPING_MARKER_FILE}; use crate::worker::{RegionWorkerLoop, DROPPING_MARKER_FILE};
const GC_TASK_INTERVAL_SEC: u64 = 5 * 60; // 5 minutes const GC_TASK_INTERVAL_SEC: u64 = 5 * 60; // 5 minutes
@@ -42,27 +42,17 @@ impl<S> RegionWorkerLoop<S> {
info!("Try to drop region: {}", region_id); info!("Try to drop region: {}", region_id);
// Marks the region as dropping. // write dropping marker
region.set_dropping()?;
// Writes dropping marker
// We rarely drop a region so we still operate in the worker loop.
let marker_path = join_path(region.access_layer.region_dir(), DROPPING_MARKER_FILE); let marker_path = join_path(region.access_layer.region_dir(), DROPPING_MARKER_FILE);
region region
.access_layer .access_layer
.object_store() .object_store()
.write(&marker_path, vec![]) .write(&marker_path, vec![])
.await .await
.context(OpenDalSnafu) .context(OpenDalSnafu)?;
.inspect_err(|e| {
error!(e; "Failed to write the drop marker file for region {}", region_id);
// Sets the state back to writable. It's possible that the marker file has been written. region.stop().await?;
// We sets the state back to writable so we can retry the drop operation. // remove this region from region map to prevent other requests from accessing this region
region.switch_state_to_writable(RegionState::Dropping);
})?;
region.stop().await;
// Removes this region from region map to prevent other requests from accessing this region
self.regions.remove_region(region_id); self.regions.remove_region(region_id);
self.dropping_regions.insert_region(region.clone()); self.dropping_regions.insert_region(region.clone());
// Notifies flush scheduler. // Notifies flush scheduler.
@@ -70,7 +60,7 @@ impl<S> RegionWorkerLoop<S> {
// Notifies compaction scheduler. // Notifies compaction scheduler.
self.compaction_scheduler.on_region_dropped(region_id); self.compaction_scheduler.on_region_dropped(region_id);
// Marks region version as dropped // mark region version as dropped
region region
.version_control .version_control
.mark_dropped(&region.memtable_builder); .mark_dropped(&region.memtable_builder);
@@ -81,7 +71,7 @@ impl<S> RegionWorkerLoop<S> {
REGION_COUNT.dec(); REGION_COUNT.dec();
// Detaches a background task to delete the region dir // detach a background task to delete the region dir
let region_dir = region.access_layer.region_dir().to_owned(); let region_dir = region.access_layer.region_dir().to_owned();
let object_store = region.access_layer.object_store().clone(); let object_store = region.access_layer.object_store().clone();
let dropping_regions = self.dropping_regions.clone(); let dropping_regions = self.dropping_regions.clone();

View File

@@ -22,8 +22,9 @@ use store_api::region_request::RegionFlushRequest;
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::config::MitoConfig; use crate::config::MitoConfig;
use crate::error::Result; use crate::error::{RegionTruncatedSnafu, Result};
use crate::flush::{FlushReason, RegionFlushTask}; use crate::flush::{FlushReason, RegionFlushTask};
use crate::manifest::action::RegionEdit;
use crate::region::MitoRegionRef; use crate::region::MitoRegionRef;
use crate::request::{FlushFailed, FlushFinished, OnFailure, OptionOutputTx}; use crate::request::{FlushFailed, FlushFinished, OnFailure, OptionOutputTx};
use crate::worker::RegionWorkerLoop; use crate::worker::RegionWorkerLoop;
@@ -177,7 +178,6 @@ impl<S> RegionWorkerLoop<S> {
engine_config, engine_config,
row_group_size, row_group_size,
cache_manager: self.cache_manager.clone(), cache_manager: self.cache_manager.clone(),
manifest_ctx: region.manifest_ctx.clone(),
index_options: region.version().options.index_options.clone(), index_options: region.version().options.index_options.clone(),
} }
} }
@@ -198,6 +198,29 @@ impl<S: LogStore> RegionWorkerLoop<S> {
return; return;
}; };
// The flush task before truncating the region fails immediately.
let version_data = region.version_control.current();
if let Some(truncated_entry_id) = version_data.version.truncated_entry_id {
if truncated_entry_id >= request.flushed_entry_id {
request.on_failure(RegionTruncatedSnafu { region_id }.build());
return;
}
}
// Write region edit to manifest.
let edit = RegionEdit {
files_to_add: std::mem::take(&mut request.file_metas),
files_to_remove: Vec::new(),
compaction_time_window: None,
flushed_entry_id: Some(request.flushed_entry_id),
flushed_sequence: Some(request.flushed_sequence),
};
if let Err(e) = region.apply_edit(edit, &request.memtables_to_remove).await {
error!(e; "Failed to write manifest, region: {}", region_id);
request.on_failure(e);
return;
}
region.update_flush_millis(); region.update_flush_millis();
// Delete wal. // Delete wal.
@@ -240,7 +263,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
&region.access_layer, &region.access_layer,
&region.file_purger, &region.file_purger,
OptionOutputTx::none(), OptionOutputTx::none(),
&region.manifest_ctx, self.config.clone(),
) { ) {
warn!( warn!(
"Failed to schedule compaction after flush, region: {}, err: {}", "Failed to schedule compaction after flush, region: {}, err: {}",

View File

@@ -1,200 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Handles manifest.
//!
//! It updates the manifest and applies the changes to the region in background.
use common_telemetry::{info, warn};
use snafu::ensure;
use store_api::storage::RegionId;
use tokio::sync::oneshot::Sender;
use crate::error::{InvalidRequestSnafu, Result};
use crate::manifest::action::{
RegionChange, RegionEdit, RegionMetaAction, RegionMetaActionList, RegionTruncate,
};
use crate::region::{MitoRegionRef, RegionState};
use crate::request::{BackgroundNotify, OptionOutputTx, TruncateResult, WorkerRequest};
use crate::worker::RegionWorkerLoop;
impl<S> RegionWorkerLoop<S> {
/// Handles region edit request.
pub(crate) async fn handle_region_edit(
&self,
region_id: RegionId,
edit: RegionEdit,
sender: Sender<Result<()>>,
) {
let region = match self.regions.writable_region(region_id) {
Ok(region) => region,
Err(e) => {
let _ = sender.send(Err(e));
return;
}
};
// Marks the region as editing.
if let Err(e) = region.set_editing() {
let _ = sender.send(Err(e));
return;
}
// Now the region is in editing state.
// Updates manifest in background.
common_runtime::spawn_bg(async move {
let result = edit_region(&region, edit).await;
if let Err(res) = sender.send(result) {
warn!(
"Failed to send result back to the worker, region_id: {}, res: {:?}",
region_id, res
);
}
// Sets the region as writable. For simplicity, we don't send the result
// back to the worker.
region.switch_state_to_writable(RegionState::Editing);
});
}
/// Writes truncate action to the manifest and then applies it to the region in background.
pub(crate) fn handle_manifest_truncate_action(
&self,
region: MitoRegionRef,
truncate: RegionTruncate,
sender: OptionOutputTx,
) {
// Marks the region as truncating.
// This prevents the region from being accessed by other write requests.
if let Err(e) = region.set_truncating() {
sender.send(Err(e));
return;
}
// Now the region is in truncating state.
let request_sender = self.sender.clone();
let manifest_ctx = region.manifest_ctx.clone();
let version_control = region.version_control.clone();
let memtable_builder = region.memtable_builder.clone();
// Updates manifest in background.
common_runtime::spawn_bg(async move {
// Write region truncated to manifest.
let action_list =
RegionMetaActionList::with_action(RegionMetaAction::Truncate(truncate.clone()));
let result = manifest_ctx
.update_manifest(RegionState::Truncating, action_list, || {
// Applies the truncate action to the region.
version_control.truncate(
truncate.truncated_entry_id,
truncate.truncated_sequence,
&memtable_builder,
);
})
.await;
// Sends the result back to the request sender.
let truncate_result = TruncateResult {
region_id: truncate.region_id,
sender,
result,
truncated_entry_id: truncate.truncated_entry_id,
truncated_sequence: truncate.truncated_sequence,
};
let _ = request_sender
.send(WorkerRequest::Background {
region_id: truncate.region_id,
notify: BackgroundNotify::Truncate(truncate_result),
})
.await
.inspect_err(|_| warn!("failed to send truncate result"));
});
}
/// Writes region change action to the manifest and then applies it to the region in background.
pub(crate) fn handle_manifest_region_change(
&self,
region: MitoRegionRef,
change: RegionChange,
sender: OptionOutputTx,
) {
// Marks the region as altering.
if let Err(e) = region.set_altering() {
sender.send(Err(e));
return;
}
// Now the region is in altering state.
common_runtime::spawn_bg(async move {
let new_meta = change.metadata.clone();
let action_list = RegionMetaActionList::with_action(RegionMetaAction::Change(change));
let result = region
.manifest_ctx
.update_manifest(RegionState::Altering, action_list, || {
// Apply the metadata to region's version.
region
.version_control
.alter_schema(new_meta, &region.memtable_builder);
})
.await;
// Sets the region as writable.
region.switch_state_to_writable(RegionState::Altering);
if result.is_ok() {
info!(
"Region {} is altered, schema version is {}",
region.region_id,
region.metadata().schema_version
);
}
sender.send(result.map(|_| 0));
});
}
}
/// Checks the edit, writes and applies it.
async fn edit_region(region: &MitoRegionRef, edit: RegionEdit) -> Result<()> {
let region_id = region.region_id;
for file_meta in &edit.files_to_add {
let is_exist = region.access_layer.is_exist(file_meta).await?;
ensure!(
is_exist,
InvalidRequestSnafu {
region_id,
reason: format!(
"trying to add a not exist file '{}' when editing region",
file_meta.file_id
)
}
);
}
info!("Applying {edit:?} to region {}", region_id);
let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone()));
region
.manifest_ctx
.update_manifest(RegionState::Editing, action_list, || {
// Applies the edit to the region.
region
.version_control
.apply_edit(edit, &[], region.file_purger.clone());
})
.await
}

View File

@@ -16,23 +16,19 @@
use common_telemetry::info; use common_telemetry::info;
use store_api::logstore::LogStore; use store_api::logstore::LogStore;
use store_api::region_request::AffectedRows;
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::error::RegionNotFoundSnafu; use crate::error::Result;
use crate::manifest::action::RegionTruncate; use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionTruncate};
use crate::region::RegionState;
use crate::request::{OptionOutputTx, TruncateResult};
use crate::worker::RegionWorkerLoop; use crate::worker::RegionWorkerLoop;
impl<S: LogStore> RegionWorkerLoop<S> { impl<S: LogStore> RegionWorkerLoop<S> {
pub(crate) async fn handle_truncate_request( pub(crate) async fn handle_truncate_request(
&mut self, &mut self,
region_id: RegionId, region_id: RegionId,
mut sender: OptionOutputTx, ) -> Result<AffectedRows> {
) { let region = self.regions.writable_region(region_id)?;
let Some(region) = self.regions.writable_region_or(region_id, &mut sender) else {
return;
};
info!("Try to truncate region {}", region_id); info!("Try to truncate region {}", region_id);
@@ -46,55 +42,36 @@ impl<S: LogStore> RegionWorkerLoop<S> {
truncated_entry_id, truncated_entry_id,
truncated_sequence, truncated_sequence,
}; };
self.handle_manifest_truncate_action(region, truncate, sender); let action_list =
} RegionMetaActionList::with_action(RegionMetaAction::Truncate(truncate.clone()));
region
/// Handles truncate result. .manifest_manager
pub(crate) async fn handle_truncate_result(&mut self, truncate_result: TruncateResult) { .write()
let region_id = truncate_result.region_id; .await
let Some(region) = self.regions.get_region(region_id) else { .update(action_list)
truncate_result.sender.send( .await?;
RegionNotFoundSnafu {
region_id: truncate_result.region_id,
}
.fail(),
);
return;
};
// We are already in the worker loop so we can set the state first.
region.switch_state_to_writable(RegionState::Truncating);
if let Err(e) = truncate_result.result {
// Unable to truncate the region.
truncate_result.sender.send(Err(e));
return;
}
// Notifies flush scheduler. // Notifies flush scheduler.
self.flush_scheduler.on_region_truncated(region_id); self.flush_scheduler.on_region_truncated(region_id);
// Notifies compaction scheduler. // Notifies compaction scheduler.
self.compaction_scheduler.on_region_truncated(region_id); self.compaction_scheduler.on_region_truncated(region_id);
// Make all data obsolete. // Reset region's version and mark all SSTs deleted.
if let Err(e) = self region.version_control.truncate(
.wal truncated_entry_id,
.obsolete( truncated_sequence,
region_id, &region.memtable_builder,
truncate_result.truncated_entry_id,
&region.wal_options,
)
.await
{
truncate_result.sender.send(Err(e));
return;
}
info!(
"Complete truncating region: {}, entry id: {} and sequence: {}.",
region_id, truncate_result.truncated_entry_id, truncate_result.truncated_sequence
); );
truncate_result.sender.send(Ok(0)); // Make all data obsolete.
self.wal
.obsolete(region_id, truncated_entry_id, &region.wal_options)
.await?;
info!(
"Complete truncating region: {}, entry id: {} and sequence: {}.",
region_id, truncated_entry_id, truncated_sequence
);
Ok(0)
} }
} }

View File

@@ -52,6 +52,7 @@ snafu.workspace = true
sql.workspace = true sql.workspace = true
sqlparser.workspace = true sqlparser.workspace = true
store-api.workspace = true store-api.workspace = true
substrait.workspace = true
table.workspace = true table.workspace = true
tokio.workspace = true tokio.workspace = true
tonic.workspace = true tonic.workspace = true

View File

@@ -541,6 +541,12 @@ pub enum Error {
end: String, end: String,
location: Location, location: Location,
}, },
#[snafu(display("Failed to convert between logical plan and substrait plan"))]
SubstraitCodec {
location: Location,
source: substrait::error::Error,
},
} }
pub type Result<T> = std::result::Result<T, Error>; pub type Result<T> = std::result::Result<T, Error>;
@@ -597,6 +603,7 @@ impl ErrorExt for Error {
Error::RequestInserts { source, .. } => source.status_code(), Error::RequestInserts { source, .. } => source.status_code(),
Error::RequestRegion { source, .. } => source.status_code(), Error::RequestRegion { source, .. } => source.status_code(),
Error::RequestDeletes { source, .. } => source.status_code(), Error::RequestDeletes { source, .. } => source.status_code(),
Error::SubstraitCodec { source, .. } => source.status_code(),
Error::ColumnDataType { source, .. } | Error::InvalidColumnDef { source, .. } => { Error::ColumnDataType { source, .. } | Error::InvalidColumnDef { source, .. } => {
source.status_code() source.status_code()

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::collections::{HashMap, HashSet}; use std::collections::HashMap;
use api::helper::ColumnDataTypeWrapper; use api::helper::ColumnDataTypeWrapper;
use api::v1::alter_expr::Kind; use api::v1::alter_expr::Kind;
@@ -31,11 +31,12 @@ use query::sql::{
}; };
use session::context::QueryContextRef; use session::context::QueryContextRef;
use session::table_name::table_idents_to_full_name; use session::table_name::table_idents_to_full_name;
use snafu::{ensure, OptionExt, ResultExt}; use snafu::{ensure, ResultExt};
use sql::ast::{ColumnDef, ColumnOption, TableConstraint}; use sql::ast::{ColumnDef, ColumnOption, TableConstraint};
use sql::statements::alter::{AlterTable, AlterTableOperation}; use sql::statements::alter::{AlterTable, AlterTableOperation};
use sql::statements::create::{CreateExternalTable, CreateTable, TIME_INDEX}; use sql::statements::create::{CreateExternalTable, CreateTable, TIME_INDEX};
use sql::statements::{column_def_to_schema, sql_column_def_to_grpc_column_def}; use sql::statements::{column_def_to_schema, sql_column_def_to_grpc_column_def};
use sql::util::to_lowercase_options_map;
use table::requests::{TableOptions, FILE_TABLE_META_KEY}; use table::requests::{TableOptions, FILE_TABLE_META_KEY};
use table::table_reference::TableReference; use table::table_reference::TableReference;
@@ -189,7 +190,8 @@ pub fn create_to_expr(create: &CreateTable, query_ctx: QueryContextRef) -> Resul
let time_index = find_time_index(&create.constraints)?; let time_index = find_time_index(&create.constraints)?;
let table_options = HashMap::from( let table_options = HashMap::from(
&TableOptions::try_from(create.options.as_ref()).context(UnrecognizedTableOptionSnafu)?, &TableOptions::try_from(&to_lowercase_options_map(&create.options))
.context(UnrecognizedTableOptionSnafu)?,
); );
let primary_keys = find_primary_keys(&create.columns, &create.constraints)?; let primary_keys = find_primary_keys(&create.columns, &create.constraints)?;
@@ -212,72 +214,9 @@ pub fn create_to_expr(create: &CreateTable, query_ctx: QueryContextRef) -> Resul
table_id: None, table_id: None,
engine: create.engine.to_string(), engine: create.engine.to_string(),
}; };
validate_create_expr(&expr)?;
Ok(expr) Ok(expr)
} }
/// Validate the [`CreateTableExpr`] request.
pub fn validate_create_expr(create: &CreateTableExpr) -> Result<()> {
// construct column list
let mut column_to_indices = HashMap::with_capacity(create.column_defs.len());
for (idx, column) in create.column_defs.iter().enumerate() {
if let Some(indices) = column_to_indices.get(&column.name) {
return InvalidSqlSnafu {
err_msg: format!(
"column name `{}` is duplicated at index {} and {}",
column.name, indices, idx
),
}
.fail();
}
column_to_indices.insert(&column.name, idx);
}
// verify time_index exists
let _ = column_to_indices
.get(&create.time_index)
.with_context(|| InvalidSqlSnafu {
err_msg: format!(
"column name `{}` is not found in column list",
create.time_index
),
})?;
// verify primary_key exists
for pk in &create.primary_keys {
let _ = column_to_indices
.get(&pk)
.with_context(|| InvalidSqlSnafu {
err_msg: format!("column name `{}` is not found in column list", pk),
})?;
}
// construct primary_key set
let mut pk_set = HashSet::new();
for pk in &create.primary_keys {
if !pk_set.insert(pk) {
return InvalidSqlSnafu {
err_msg: format!("column name `{}` is duplicated in primary keys", pk),
}
.fail();
}
}
// verify time index is not primary key
if pk_set.contains(&create.time_index) {
return InvalidSqlSnafu {
err_msg: format!(
"column name `{}` is both primary key and time index",
create.time_index
),
}
.fail();
}
Ok(())
}
fn find_primary_keys( fn find_primary_keys(
columns: &[ColumnDef], columns: &[ColumnDef],
constraints: &[TableConstraint], constraints: &[TableConstraint],
@@ -500,7 +439,7 @@ mod tests {
#[test] #[test]
fn test_create_to_expr() { fn test_create_to_expr() {
let sql = "CREATE TABLE monitor (host STRING,ts TIMESTAMP,TIME INDEX (ts),PRIMARY KEY(host)) ENGINE=mito WITH(ttl='3days', write_buffer_size='1024KB');"; let sql = "CREATE TABLE monitor (host STRING,ts TIMESTAMP,TIME INDEX (ts),PRIMARY KEY(host)) ENGINE=mito WITH(regions=1, ttl='3days', write_buffer_size='1024KB');";
let stmt = let stmt =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
.unwrap() .unwrap()
@@ -518,33 +457,6 @@ mod tests {
); );
} }
#[test]
fn test_invalid_create_to_expr() {
let cases = [
// duplicate column declaration
"CREATE TABLE monitor (host STRING primary key, ts TIMESTAMP TIME INDEX, some_column text, some_column string);",
// duplicate primary key
"CREATE TABLE monitor (host STRING, ts TIMESTAMP TIME INDEX, some_column STRING, PRIMARY KEY (some_column, host, some_column));",
// time index is primary key
"CREATE TABLE monitor (host STRING, ts TIMESTAMP TIME INDEX, PRIMARY KEY (host, ts));"
];
for sql in cases {
let stmt = ParserContext::create_with_dialect(
sql,
&GreptimeDbDialect {},
ParseOptions::default(),
)
.unwrap()
.pop()
.unwrap();
let Statement::CreateTable(create_table) = stmt else {
unreachable!()
};
create_to_expr(&create_table, QueryContext::arc()).unwrap_err();
}
}
#[test] #[test]
fn test_create_to_expr_with_default_timestamp_value() { fn test_create_to_expr_with_default_timestamp_value() {
let sql = "CREATE TABLE monitor (v double,ts TIMESTAMP default '2024-01-30T00:01:01',TIME INDEX (ts)) engine=mito;"; let sql = "CREATE TABLE monitor (v double,ts TIMESTAMP default '2024-01-30T00:01:01',TIME INDEX (ts)) engine=mito;";

View File

@@ -164,6 +164,10 @@ impl StatementExecutor {
let _ = self.create_external_table(stmt, query_ctx).await?; let _ = self.create_external_table(stmt, query_ctx).await?;
Ok(Output::new_with_affected_rows(0)) Ok(Output::new_with_affected_rows(0))
} }
Statement::CreateView(stmt) => {
let _ = self.create_view(stmt, query_ctx).await?;
Ok(Output::new_with_affected_rows(0))
}
Statement::Alter(alter_table) => self.alter_table(alter_table, query_ctx).await, Statement::Alter(alter_table) => self.alter_table(alter_table, query_ctx).await,
Statement::DropTable(stmt) => { Statement::DropTable(stmt) => {
let (catalog, schema, table) = let (catalog, schema, table) =
@@ -256,6 +260,13 @@ impl StatementExecutor {
.context(PlanStatementSnafu) .context(PlanStatementSnafu)
} }
pub fn optimize_logical_plan(&self, plan: LogicalPlan) -> Result<LogicalPlan> {
self.query_engine
.planner()
.optimize(plan)
.context(PlanStatementSnafu)
}
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
async fn plan_exec(&self, stmt: QueryStatement, query_ctx: QueryContextRef) -> Result<Output> { async fn plan_exec(&self, stmt: QueryStatement, query_ctx: QueryContextRef) -> Result<Output> {
let plan = self.plan(stmt, query_ctx.clone()).await?; let plan = self.plan(stmt, query_ctx.clone()).await?;

View File

@@ -39,16 +39,21 @@ use datatypes::value::Value;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use partition::expr::{Operand, PartitionExpr, RestrictedOp}; use partition::expr::{Operand, PartitionExpr, RestrictedOp};
use partition::partition::{PartitionBound, PartitionDef}; use partition::partition::{PartitionBound, PartitionDef};
use query::parser::QueryStatement;
use query::sql::create_table_stmt; use query::sql::create_table_stmt;
use regex::Regex; use regex::Regex;
use session::context::QueryContextRef; use session::context::QueryContextRef;
use session::table_name::table_idents_to_full_name; use session::table_name::table_idents_to_full_name;
use snafu::{ensure, IntoError, OptionExt, ResultExt}; use snafu::{ensure, IntoError, OptionExt, ResultExt};
use sql::statements::alter::AlterTable; use sql::statements::alter::AlterTable;
use sql::statements::create::{CreateExternalTable, CreateTable, CreateTableLike, Partitions}; use sql::statements::create::{
CreateExternalTable, CreateTable, CreateTableLike, CreateView, Partitions,
};
use sql::statements::sql_value_to_value; use sql::statements::sql_value_to_value;
use sql::statements::statement::Statement;
use sqlparser::ast::{Expr, Ident, Value as ParserValue}; use sqlparser::ast::{Expr, Ident, Value as ParserValue};
use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME}; use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME};
use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
use table::dist_table::DistTable; use table::dist_table::DistTable;
use table::metadata::{self, RawTableInfo, RawTableMeta, TableId, TableInfo, TableType}; use table::metadata::{self, RawTableInfo, RawTableMeta, TableId, TableInfo, TableType};
use table::requests::{AlterKind, AlterTableRequest, TableOptions}; use table::requests::{AlterKind, AlterTableRequest, TableOptions};
@@ -60,7 +65,7 @@ use crate::error::{
CreateLogicalTablesSnafu, CreateTableInfoSnafu, DdlWithMultiCatalogsSnafu, CreateLogicalTablesSnafu, CreateTableInfoSnafu, DdlWithMultiCatalogsSnafu,
DdlWithMultiSchemasSnafu, DeserializePartitionSnafu, EmptyDdlExprSnafu, DdlWithMultiSchemasSnafu, DeserializePartitionSnafu, EmptyDdlExprSnafu,
InvalidPartitionColumnsSnafu, InvalidPartitionRuleSnafu, InvalidTableNameSnafu, InvalidPartitionColumnsSnafu, InvalidPartitionRuleSnafu, InvalidTableNameSnafu,
ParseSqlValueSnafu, Result, SchemaNotFoundSnafu, TableAlreadyExistsSnafu, ParseSqlValueSnafu, Result, SchemaNotFoundSnafu, SubstraitCodecSnafu, TableAlreadyExistsSnafu,
TableMetadataManagerSnafu, TableNotFoundSnafu, UnrecognizedTableOptionSnafu, TableMetadataManagerSnafu, TableNotFoundSnafu, UnrecognizedTableOptionSnafu,
}; };
use crate::expr_factory; use crate::expr_factory;
@@ -320,6 +325,33 @@ impl StatementExecutor {
.collect()) .collect())
} }
#[tracing::instrument(skip_all)]
pub async fn create_view(
&self,
create_view: CreateView,
ctx: QueryContextRef,
) -> Result<TableRef> {
// convert input into logical plan
let logical_plan = match *create_view.input {
Statement::Query(query) => {
self.plan(QueryStatement::Sql(Statement::Query(query)), ctx)
.await?
}
Statement::Tql(query) => self.plan_tql(query, &ctx).await?,
_ => {
todo!("throw an error")
}
};
let optimized_plan = self.optimize_logical_plan(logical_plan)?;
// encode logical plan
let encoded_plan = DFLogicalSubstraitConvertor
.encode(&optimized_plan.unwrap_df_plan())
.context(SubstraitCodecSnafu)?;
todo!()
}
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
pub async fn alter_logical_tables(&self, alter_table_exprs: Vec<AlterExpr>) -> Result<Output> { pub async fn alter_logical_tables(&self, alter_table_exprs: Vec<AlterExpr>) -> Result<Output> {
let _timer = crate::metrics::DIST_ALTER_TABLES.start_timer(); let _timer = crate::metrics::DIST_ALTER_TABLES.start_timer();

View File

@@ -20,6 +20,7 @@ use query::parser::{
PromQuery, QueryLanguageParser, ANALYZE_NODE_NAME, ANALYZE_VERBOSE_NODE_NAME, PromQuery, QueryLanguageParser, ANALYZE_NODE_NAME, ANALYZE_VERBOSE_NODE_NAME,
DEFAULT_LOOKBACK_STRING, EXPLAIN_NODE_NAME, EXPLAIN_VERBOSE_NODE_NAME, DEFAULT_LOOKBACK_STRING, EXPLAIN_NODE_NAME, EXPLAIN_VERBOSE_NODE_NAME,
}; };
use query::plan::LogicalPlan;
use session::context::QueryContextRef; use session::context::QueryContextRef;
use snafu::ResultExt; use snafu::ResultExt;
use sql::statements::tql::Tql; use sql::statements::tql::Tql;
@@ -28,8 +29,9 @@ use crate::error::{ExecLogicalPlanSnafu, ParseQuerySnafu, PlanStatementSnafu, Re
use crate::statement::StatementExecutor; use crate::statement::StatementExecutor;
impl StatementExecutor { impl StatementExecutor {
/// Plan the given [Tql] query and return the [LogicalPlan].
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
pub(super) async fn execute_tql(&self, tql: Tql, query_ctx: QueryContextRef) -> Result<Output> { pub async fn plan_tql(&self, tql: Tql, query_ctx: &QueryContextRef) -> Result<LogicalPlan> {
let stmt = match tql { let stmt = match tql {
Tql::Eval(eval) => { Tql::Eval(eval) => {
let promql = PromQuery { let promql = PromQuery {
@@ -86,12 +88,17 @@ impl StatementExecutor {
.unwrap() .unwrap()
} }
}; };
let plan = self self.query_engine
.query_engine
.planner() .planner()
.plan(stmt, query_ctx.clone()) .plan(stmt, query_ctx.clone())
.await .await
.context(PlanStatementSnafu)?; .context(PlanStatementSnafu)
}
/// Execute the given [Tql] query and return the result.
#[tracing::instrument(skip_all)]
pub(super) async fn execute_tql(&self, tql: Tql, query_ctx: QueryContextRef) -> Result<Output> {
let plan = self.plan_tql(tql, &query_ctx).await?;
self.query_engine self.query_engine
.execute(plan, query_ctx) .execute(plan, query_ctx)
.await .await

View File

@@ -381,8 +381,8 @@ impl RecordBatchStream for ScalarCalculateStream {
impl ScalarCalculateStream { impl ScalarCalculateStream {
fn update_batch(&mut self, batch: RecordBatch) -> DataFusionResult<()> { fn update_batch(&mut self, batch: RecordBatch) -> DataFusionResult<()> {
let _timer = self.metric.elapsed_compute(); let _timer = self.metric.elapsed_compute();
// if have multi time series or empty batch, scalar will return NaN // if have multi time series, scalar will return NaN
if self.have_multi_series || batch.num_rows() == 0 { if self.have_multi_series {
return Ok(()); return Ok(());
} }
// fast path: no tag columns means all data belongs to the same series. // fast path: no tag columns means all data belongs to the same series.
@@ -493,18 +493,51 @@ mod test {
use super::*; use super::*;
fn prepare_test_data(series: Vec<RecordBatch>) -> MemoryExec { fn prepare_test_data(diff_series: bool) -> MemoryExec {
let schema = Arc::new(Schema::new(vec![ let schema = Arc::new(Schema::new(vec![
Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true),
Field::new("tag1", DataType::Utf8, true), Field::new("tag1", DataType::Utf8, true),
Field::new("tag2", DataType::Utf8, true), Field::new("tag2", DataType::Utf8, true),
Field::new("val", DataType::Float64, true), Field::new("val", DataType::Float64, true),
])); ]));
MemoryExec::try_new(&[series], schema, None).unwrap() let batch_1 = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(TimestampMillisecondArray::from(vec![0, 5_000])),
Arc::new(StringArray::from(vec!["foo", "foo"])),
Arc::new(StringArray::from(vec!["🥺", "🥺"])),
Arc::new(Float64Array::from(vec![1.0, 2.0])),
],
)
.unwrap();
let batch_2 = if diff_series {
RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(TimestampMillisecondArray::from(vec![10_000, 15_000])),
Arc::new(StringArray::from(vec!["foo", "foo"])),
Arc::new(StringArray::from(vec!["🥺", "😝"])),
Arc::new(Float64Array::from(vec![3.0, 4.0])),
],
)
.unwrap()
} else {
RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(TimestampMillisecondArray::from(vec![10_000, 15_000])),
Arc::new(StringArray::from(vec!["foo", "foo"])),
Arc::new(StringArray::from(vec!["🥺", "🥺"])),
Arc::new(Float64Array::from(vec![3.0, 4.0])),
],
)
.unwrap()
};
MemoryExec::try_new(&[vec![batch_1, batch_2]], schema, None).unwrap()
} }
async fn run_test(series: Vec<RecordBatch>, expected: &str) { async fn run_test(diff_series: bool, expected: &str) {
let memory_exec = Arc::new(prepare_test_data(series)); let memory_exec = Arc::new(prepare_test_data(diff_series));
let schema = Arc::new(Schema::new(vec![ let schema = Arc::new(Schema::new(vec![
Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true),
Field::new("val", DataType::Float64, true), Field::new("val", DataType::Float64, true),
@@ -537,35 +570,8 @@ mod test {
#[tokio::test] #[tokio::test]
async fn same_series() { async fn same_series() {
let schema = Arc::new(Schema::new(vec![
Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true),
Field::new("tag1", DataType::Utf8, true),
Field::new("tag2", DataType::Utf8, true),
Field::new("val", DataType::Float64, true),
]));
run_test( run_test(
vec![ false,
RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(TimestampMillisecondArray::from(vec![0, 5_000])),
Arc::new(StringArray::from(vec!["foo", "foo"])),
Arc::new(StringArray::from(vec!["🥺", "🥺"])),
Arc::new(Float64Array::from(vec![1.0, 2.0])),
],
)
.unwrap(),
RecordBatch::try_new(
schema,
vec![
Arc::new(TimestampMillisecondArray::from(vec![10_000, 15_000])),
Arc::new(StringArray::from(vec!["foo", "foo"])),
Arc::new(StringArray::from(vec!["🥺", "🥺"])),
Arc::new(Float64Array::from(vec![3.0, 4.0])),
],
)
.unwrap(),
],
"+---------------------+-----+\ "+---------------------+-----+\
\n| ts | val |\ \n| ts | val |\
\n+---------------------+-----+\ \n+---------------------+-----+\
@@ -580,66 +586,8 @@ mod test {
#[tokio::test] #[tokio::test]
async fn diff_series() { async fn diff_series() {
let schema = Arc::new(Schema::new(vec![
Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true),
Field::new("tag1", DataType::Utf8, true),
Field::new("tag2", DataType::Utf8, true),
Field::new("val", DataType::Float64, true),
]));
run_test( run_test(
vec![ true,
RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(TimestampMillisecondArray::from(vec![0, 5_000])),
Arc::new(StringArray::from(vec!["foo", "foo"])),
Arc::new(StringArray::from(vec!["🥺", "🥺"])),
Arc::new(Float64Array::from(vec![1.0, 2.0])),
],
)
.unwrap(),
RecordBatch::try_new(
schema,
vec![
Arc::new(TimestampMillisecondArray::from(vec![10_000, 15_000])),
Arc::new(StringArray::from(vec!["foo", "foo"])),
Arc::new(StringArray::from(vec!["🥺", "😝"])),
Arc::new(Float64Array::from(vec![3.0, 4.0])),
],
)
.unwrap(),
],
"+---------------------+-----+\
\n| ts | val |\
\n+---------------------+-----+\
\n| 1970-01-01T00:00:00 | NaN |\
\n| 1970-01-01T00:00:05 | NaN |\
\n| 1970-01-01T00:00:10 | NaN |\
\n| 1970-01-01T00:00:15 | NaN |\
\n+---------------------+-----+",
)
.await
}
#[tokio::test]
async fn empty_series() {
let schema = Arc::new(Schema::new(vec![
Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true),
Field::new("tag1", DataType::Utf8, true),
Field::new("tag2", DataType::Utf8, true),
Field::new("val", DataType::Float64, true),
]));
run_test(
vec![RecordBatch::try_new(
schema,
vec![
Arc::new(TimestampMillisecondArray::new_null(0)),
Arc::new(StringArray::new_null(0)),
Arc::new(StringArray::new_null(0)),
Arc::new(Float64Array::new_null(0)),
],
)
.unwrap()],
"+---------------------+-----+\ "+---------------------+-----+\
\n| ts | val |\ \n| ts | val |\
\n+---------------------+-----+\ \n+---------------------+-----+\
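The comment in `ScalarCalculateStream::update_batch` above refers to PromQL's `scalar()` rule: a value is produced only when exactly one time series is present; otherwise the result is NaN. A tiny illustrative sketch of that rule (not the stream implementation itself):

```rust
// scalar() semantics: exactly one series yields its value, anything else NaN.
fn scalar_at_step(series_values: &[f64]) -> f64 {
    if series_values.len() == 1 {
        series_values[0]
    } else {
        f64::NAN
    }
}

fn main() {
    assert_eq!(scalar_at_step(&[42.0]), 42.0);
    assert!(scalar_at_step(&[1.0, 2.0]).is_nan());
    assert!(scalar_at_step(&[]).is_nan());
}
```

Zero series fails the "exactly one element" requirement just as multiple series do, which is what the `empty_series` test in this diff exercises.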

View File

@@ -87,6 +87,13 @@ impl LogicalPlan {
.context(DataFusionSnafu) .context(DataFusionSnafu)
.map(LogicalPlan::DfPlan) .map(LogicalPlan::DfPlan)
} }
/// Unwrap the logical plan into a DataFusion logical plan
pub fn unwrap_df_plan(self) -> DfLogicalPlan {
match self {
LogicalPlan::DfPlan(plan) => plan,
}
}
} }
impl From<DfLogicalPlan> for LogicalPlan { impl From<DfLogicalPlan> for LogicalPlan {

View File

@@ -42,6 +42,8 @@ use crate::{DfContextProviderAdapter, QueryEngineContext};
pub trait LogicalPlanner: Send + Sync { pub trait LogicalPlanner: Send + Sync {
async fn plan(&self, stmt: QueryStatement, query_ctx: QueryContextRef) -> Result<LogicalPlan>; async fn plan(&self, stmt: QueryStatement, query_ctx: QueryContextRef) -> Result<LogicalPlan>;
fn optimize(&self, plan: LogicalPlan) -> Result<LogicalPlan>;
fn as_any(&self) -> &dyn Any; fn as_any(&self) -> &dyn Any;
} }
@@ -145,6 +147,14 @@ impl DfLogicalPlanner {
.map_err(BoxedError::new) .map_err(BoxedError::new)
.context(QueryPlanSnafu) .context(QueryPlanSnafu)
} }
#[tracing::instrument(skip_all)]
fn optimize_logical_plan(&self, plan: LogicalPlan) -> Result<LogicalPlan> {
self.engine_state
.optimize_logical_plan(plan.unwrap_df_plan())
.context(DataFusionSnafu)
.map(Into::into)
}
} }
#[async_trait] #[async_trait]
@@ -157,6 +167,10 @@ impl LogicalPlanner for DfLogicalPlanner {
} }
} }
fn optimize(&self, plan: LogicalPlan) -> Result<LogicalPlan> {
self.optimize_logical_plan(plan)
}
fn as_any(&self) -> &dyn Any { fn as_any(&self) -> &dyn Any {
self self
} }

View File

@@ -142,6 +142,11 @@ impl QueryEngineState {
}) })
} }
/// Run the full logical plan optimize phase for the given plan.
pub fn optimize_logical_plan(&self, plan: DfLogicalPlan) -> DfResult<DfLogicalPlan> {
self.session_state().optimize(&plan)
}
/// Register an udf function. /// Register an udf function.
/// Will override if the function with same name is already registered. /// Will override if the function with same name is already registered.
pub fn register_function(&self, func: FunctionRef) { pub fn register_function(&self, func: FunctionRef) {

Some files were not shown because too many files have changed in this diff.