Compare commits

...

38 Commits

Author SHA1 Message Date
discord9
37b4d6f682 fix: drop arc explicitly 2025-04-10 15:17:22 +08:00
discord9
a175c22297 fix: conn pool leak & placeholder feature so ci can compile 2025-04-10 14:47:17 +08:00
discord9
2974902b8b fix: time window filter expr use OR 2025-03-27 15:38:35 +08:00
discord9
9fcb0874e2 fix: convert timestamp unit too 2025-03-13 16:33:14 +08:00
discord9
f04f16cec3 fix: quote&more info when time window too many
chore: even more warning

fix: filter first warn later
2025-03-13 15:29:10 +08:00
discord9
07c0f1d546 chore: after rebase fix 2025-03-11 15:08:06 +08:00
discord9
fc7b38af05 fix: subquery&cte time window expr 2025-03-11 14:56:08 +08:00
discord9
51c7539ecb refactor: even finer&limit time window num 2025-03-11 14:56:08 +08:00
discord9
09039648d5 feat: basic time window aware 2025-03-11 14:56:08 +08:00
discord9
01c9c5a3d0 metrics: better bucket&longer timeout 2025-03-11 14:56:08 +08:00
discord9
c3bc69a784 fix: timeout 2025-03-11 14:56:08 +08:00
discord9
d5f9630641 fix: heartbeat&expire_after unit 2025-03-11 14:56:08 +08:00
discord9
38e56340db feat: time window in df plan
WIP

test: found out time window expr

chore: pub

tests: also unparsed

tests: rm dup code

feat: frontend client for recording rule

fix: bound edgecase

WIP

WIP

feat: rule engine

feat: add init options & tmp reroute to rule

fix: dist client get

fix: also not handle mirror write in flownode

chore: clippy
2025-03-11 14:56:08 +08:00
Ning Sun
d0254f9705 feat: update promql-parser to 0.5 for duration literal (#5682) 2025-03-11 06:27:36 +00:00
Ning Sun
8a86903c73 feat: add description for each grafana panel (#5673)
* feat: add description for each grafana panel

* Apply suggestions from code review

Co-authored-by: Yingwen <realevenyag@gmail.com>

* fix: unit of write stall

* feat: add jq script to summarize the grafana dashboard

* fix: update description

* ci: add ci step to validate grafana and send summary as comment

* ci: update check

* ci: update ci

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-03-11 06:16:49 +00:00
Weny Xu
0bd322a078 perf(prom): optimize label values query (#5653)
perf: optimize label values query
2025-03-10 13:20:47 +00:00
discord9
3811e3f632 feat: also get index file&expose mito in metrics (#5680)
* feat: download index file too

* feat: expose mito in metrics

* chore: fmt
2025-03-10 13:07:08 +00:00
localhost
c14aa176b5 chore: impl ref and ref_mut for json like (#5679)
* chore: impl ref and ref_mut for json like

* chore: add code source
2025-03-10 10:43:15 +00:00
Lei, HUANG
a922dcd9df refactor(mito): move wal sync task to background (#5677)
chore/move-wal-sync-to-bg:
 ### Refactor Log Store Task Management

 - **Error Handling Enhancements**: Updated error handling for task management in `error.rs` by renaming `StartGcTask` and `StopGcTask` to `StartWalTask` and `StopWalTask`, respectively, and added a `name` field for more descriptive error messages.
 - **Task Management Improvements**: Introduced `SyncWalTaskFunction` in `log_store.rs` to handle periodic synchronization of WAL tasks, replacing the previous atomic-based sync logic.
 - **Backend Adjustments**: Modified `backend.rs` to use the new `StartWalTaskSnafu` for starting tasks, ensuring consistency with the updated error handling approach.
2025-03-10 08:22:35 +00:00
dennis zhuang
530ff53422 feat(promql): supports quantile and count_values (#5652)
* feat(promql): supports quantile

* fix: merge_batch

* chore: sqlness test

* test: unit tests

* feat: implements count_values

* fix: typo

* refactor: planner

* chore: apply review suggestions

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-03-10 06:41:40 +00:00
Ruihang Xia
73ca39f37e feat: time series distribution in scanner (#5675)
* define distribution

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: SeqScan support per series distribution

* probe distribution

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* reverse sort order

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* more strict check

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change null's ordering

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
2025-03-10 05:43:17 +00:00
Yingwen
0acc6b0354 fix: correct stalled count (#5678) 2025-03-10 04:25:38 +00:00
Zhenchi
face361fcb feat: introduce roaring bitmap to optimize sparse value scenarios (#5603)
* feat: introduce roaring bitmap to optimize sparse value scenarios

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix taplo

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* polish

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-03-10 04:24:08 +00:00
Yingwen
9860bca986 feat: support exact filter on time index column (#5671)
* feat: add predicate group

* feat: pass predicate group

* feat: memtable prune by time filters

* test: test PruneTimeIterator with time filters

* feat: push down returns exact for timestamp simple filters

---------

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
2025-03-07 21:55:46 +00:00
ZonaHe
3a83c33a48 feat: update dashboard to v0.8.0 (#5666)
Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
Co-authored-by: Ning Sun <sunng@protonmail.com>
2025-03-07 19:47:02 +00:00
Ruihang Xia
373bd59b07 fix: update column requirements to use Column type instead of String (#5672)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-03-07 18:50:15 +00:00
shuiyisong
c8db4b286d fix: use DateTime instead of NaiveDateTime (#5669)
chore: use datetime instead of naivedatetime
2025-03-07 07:41:59 +00:00
Lei, HUANG
56c8c0651f fix: skip schema check to avoid schema mismatch brought by metadata (#5662)
* fix: skip schema check to avoid schema mismatch brought by metadata

* docs: add some comment to remind me add that check back

* test: add sqlness case

* fix/skip-schema-check:
 ### Update CTE Test Cases

 - **Added GRPC Latencies Test**: Introduced a new test case for GRPC latencies in `cte.result` and `cte.sql` under `standalone/common/cte`.
 - **Removed Redundant Test Files**: Deleted `cte.result` and `cte.sql` under `standalone/common/range` as they were duplicates of the new test case.
2025-03-07 05:47:45 +00:00
shuiyisong
448e588fa7 chore: improve /v1/jaeger/api/trace/{trace_id}'s resp (#5663)
* chore: improve jaeger trace api resp

* chore: fix timestamp type

* chore: fix timestamp type

* chore: complete more fields

* chore: change to microseconds

* chore: add empty check & span status code

* chore: minor update

* chore: update test
2025-03-07 04:31:42 +00:00
Yingwen
f4cbf1d776 docs: update cluster dashboard to make opendal panel works (#5661) 2025-03-07 02:49:15 +00:00
discord9
b35eefcf45 perf: rm coalesce batch when target_batch_size > fetch limit (#5658)
* fix: rm coalesce > limit

* fix: only rm one&test: sqlness
2025-03-07 02:45:07 +00:00
yihong
408dd55a2f fix: flaky test in sqlness by fix random port (#5657)
* fix: flaky test in sqlness by fix random port

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: typo

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: panic instead of forever loop

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-03-07 00:41:22 +00:00
Ruihang Xia
e463942a5b fix: recover plan schema after dist analyzer (#5665)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-03-07 00:29:55 +00:00
discord9
0124a0d156 fix: window sort not apply when other column alias to time index name (#5634)
* fix: other col alias to time index column handle

* test: update sqlness

* chore: per review

* test: more sqlness

* test: mv some to optimizer folder

* fix: resolve alias properly

* fix: also retain old name

* chore: remove wrong comment

* chore: fix sqlness

* test: standalone/dist more projection diff
2025-03-06 08:05:57 +00:00
liyang
e23628a4e0 ci: bump dev-builder image version to 2024-12-25-a71b93dd-20250305072908 (#5651) 2025-03-06 03:33:17 +00:00
Weny Xu
1d637cad51 fix(metric-engine): group DDL requests (#5628)
* fix(metric-engine): group DDL requests

* test: add sqlness tests

* chore: apply suggestions from CR

* chore: apply suggestions from CR
2025-03-05 09:17:47 +00:00
Lei, HUANG
a56030e6a5 refactor: remove cluster id field (#5610)
* chore: resolve conflicts

* chore: merge main

* test: add compatibility test for DatanodeLeaseKey with missing cluster_id

* test: add compatibility test for DatanodeLeaseKey without cluster_id

* refactor/remove-cluster-id:
 - **Update `greptime-proto` Dependency**: Updated the `greptime-proto` dependency in `Cargo.lock` and `Cargo.toml` to a new revision.
 - **Remove `cluster_id` Usage**: Removed the `cluster_id` field and its related logic from various files, including `cluster.rs`, `datanode.rs`, `rpc.rs`,
 `adapter.rs`, `client.rs`, `ask_leader.rs`, `heartbeat.rs`, `procedure.rs`, `store.rs`, `handler.rs`, `response_header_handler.rs`, `key.rs`, `datanode.rs`,
 `lease.rs`, `metrics.rs`, `cluster.rs`, `heartbeat.rs`, `procedure.rs`, and `store.rs`.
 - **Refactor Tests**: Updated tests in `client.rs`, `response_header_handler.rs`, `store.rs`, and `service` modules to reflect the removal of `cluster_id`.

* fix: clippy

* refactor/remove-cluster-id:
 **Refactor and Cleanup in Meta Server**

 - **`response_header_handler.rs`**: Removed unused import of `HeartbeatResponse` and cleaned up the test function by eliminating the creation of an unused `HeartbeatResponse` object.
 - **`node_lease.rs`**: Simplified parameter handling in `HttpHandler` implementation by using an underscore for unused parameters.

* refactor/remove-cluster-id:
 ### Remove `TableMetadataAllocatorContext` and Refactor Code

 - **Removed `TableMetadataAllocatorContext`**: Eliminated the `TableMetadataAllocatorContext` struct and its usage across multiple files, including `ddl.rs`, `create_table.rs`, `create_view.rs`, `table_meta.rs`, `test_util.rs`, `create_logical_tables.rs`,
 `drop_table.rs`, and `table_meta_alloc.rs`.
 - **Refactored Function Signatures**: Updated function signatures to remove the `TableMetadataAllocatorContext` parameter in methods like `create`, `create_view`, and `alloc` in `table_meta.rs` and `table_meta_alloc.rs`.
 - **Updated Imports**: Adjusted import statements to reflect the removal of `TableMetadataAllocatorContext` in affected files.

 These changes simplify the codebase by removing an unnecessary context struct and updating related function calls.

* refactor/remove-cluster-id:
 ### Update `datanode.rs` to Modify Key Prefix

 - **File Modified**: `src/common/meta/src/datanode.rs`
 - **Key Changes**:
   - Updated `DatanodeStatKey::prefix_key` and `From<DatanodeStatKey>` to remove the cluster ID from the key prefix.
   - Adjusted comments to reflect the changes in key prefix handling.

* reformat code

* refactor/remove-cluster-id:
 ### Commit Summary

 - **Refactor `Pusher` Initialization**: Removed the `RequestHeader` parameter from the `Pusher::new` method across multiple files, including `handler.rs`, `test_util.rs`, and `heartbeat.rs`. This change simplifies the `Pusher` initialization process by eliminating the unnecessary parameter.
 - **Update Imports**: Adjusted import statements in `handler.rs` and `test_util.rs` to remove unused `RequestHeader` references, ensuring cleaner and more efficient code.

* chore: update proto
2025-03-05 08:22:18 +00:00
liyang
a71b93dd84 fix: unable to install software-properties-common in dev builder (#5643)
* fix: unable to install software-properties-common in dev builder

* test dev builder

* improve dev-build image

* setup qemu action
2025-03-05 07:07:06 +00:00
224 changed files with 11988 additions and 6629 deletions

View File

@@ -41,7 +41,14 @@ runs:
username: ${{ inputs.dockerhub-image-registry-username }}
password: ${{ inputs.dockerhub-image-registry-token }}
- name: Build and push dev-builder-ubuntu image
- name: Set up qemu for multi-platform builds
uses: docker/setup-qemu-action@v3
with:
platforms: linux/amd64,linux/arm64
# The latest version will lead to segmentation fault.
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Build and push dev-builder-ubuntu image # Build image for amd64 and arm64 platform.
shell: bash
if: ${{ inputs.build-dev-builder-ubuntu == 'true' }}
run: |
@@ -52,7 +59,7 @@ runs:
IMAGE_NAMESPACE=${{ inputs.dockerhub-image-namespace }} \
DEV_BUILDER_IMAGE_TAG=${{ inputs.version }}
- name: Build and push dev-builder-centos image
- name: Build and push dev-builder-centos image # Only build image for amd64 platform.
shell: bash
if: ${{ inputs.build-dev-builder-centos == 'true' }}
run: |
@@ -69,8 +76,7 @@ runs:
run: |
make dev-builder \
BASE_IMAGE=android \
BUILDX_MULTI_PLATFORM_BUILD=amd64 \
IMAGE_REGISTRY=${{ inputs.dockerhub-image-registry }} \
IMAGE_NAMESPACE=${{ inputs.dockerhub-image-namespace }} \
DEV_BUILDER_IMAGE_TAG=${{ inputs.version }} && \
docker push ${{ inputs.dockerhub-image-registry }}/${{ inputs.dockerhub-image-namespace }}/dev-builder-android:${{ inputs.version }}
DEV_BUILDER_IMAGE_TAG=${{ inputs.version }}

.github/workflows/grafana.yml (new file, 52 lines added)

@@ -0,0 +1,52 @@
name: Check Grafana Panels
on:
pull_request:
branches:
- main
paths:
- 'grafana/**' # Trigger only when files under the grafana/ directory change
jobs:
check-panels:
runs-on: ubuntu-latest
steps:
# Check out the repository
- name: Checkout repository
uses: actions/checkout@v4
# Install jq (required for the script)
- name: Install jq
run: sudo apt-get install -y jq
# Make the check.sh script executable
- name: Make check.sh executable
run: chmod +x grafana/check.sh
# Run the check.sh script
- name: Run check.sh
run: ./grafana/check.sh
# Only run summary.sh for pull_request events (not for merge queues or final pushes)
- name: Check if this is a pull request
id: check-pr
run: |
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
echo "is_pull_request=true" >> $GITHUB_OUTPUT
else
echo "is_pull_request=false" >> $GITHUB_OUTPUT
fi
# Make the summary.sh script executable
- name: Make summary.sh executable
if: steps.check-pr.outputs.is_pull_request == 'true'
run: chmod +x grafana/summary.sh
# Run the summary.sh script and add its output to the GitHub Job Summary
- name: Run summary.sh and add to Job Summary
if: steps.check-pr.outputs.is_pull_request == 'true'
run: |
SUMMARY=$(./grafana/summary.sh)
echo "### Summary of Grafana Panels" >> $GITHUB_STEP_SUMMARY
echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY

Cargo.lock (generated)

@@ -4167,6 +4167,7 @@ dependencies = [
"bytes",
"cache",
"catalog",
"chrono",
"client",
"common-base",
"common-catalog",
@@ -4701,7 +4702,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=072ce580502e015df1a6b03a185b60309a7c2a7a#072ce580502e015df1a6b03a185b60309a7c2a7a"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=d92c9ac4e90ef4abdcf5c2eaf5a164e18ba09486#d92c9ac4e90ef4abdcf5c2eaf5a164e18ba09486"
dependencies = [
"prost 0.13.3",
"serde",
@@ -5566,6 +5567,7 @@ dependencies = [
"rand",
"regex",
"regex-automata 0.4.8",
"roaring",
"serde",
"serde_json",
"snafu 0.8.5",
@@ -5897,15 +5899,15 @@ dependencies = [
[[package]]
name = "jsonpath-rust"
version = "0.7.3"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69a61b87f6a55cc6c28fed5739dd36b9642321ce63e4a5e4a4715d69106f4a10"
checksum = "0c00ae348f9f8fd2d09f82a98ca381c60df9e0820d8d79fce43e649b4dc3128b"
dependencies = [
"pest",
"pest_derive",
"regex",
"serde_json",
"thiserror 1.0.64",
"thiserror 2.0.12",
]
[[package]]
@@ -8270,7 +8272,7 @@ dependencies = [
"rand",
"ring",
"rust_decimal",
"thiserror 2.0.6",
"thiserror 2.0.12",
"tokio",
"tokio-rustls 0.26.0",
"tokio-util",
@@ -8382,7 +8384,7 @@ dependencies = [
"greptime-proto",
"itertools 0.10.5",
"jsonb",
"jsonpath-rust 0.7.3",
"jsonpath-rust 0.7.5",
"lazy_static",
"moka",
"once_cell",
@@ -8760,6 +8762,7 @@ dependencies = [
"common-recordbatch",
"common-telemetry",
"datafusion",
"datafusion-common",
"datafusion-expr",
"datatypes",
"futures",
@@ -8773,8 +8776,9 @@ dependencies = [
[[package]]
name = "promql-parser"
version = "0.4.3"
source = "git+https://github.com/GreptimeTeam/promql-parser.git?rev=27abb8e16003a50c720f00d6c85f41f5fa2a2a8e#27abb8e16003a50c720f00d6c85f41f5fa2a2a8e"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c6b1429bdd199d53bd58b745075c1652efedbe2746e5d4f0d56d3184dda48ec"
dependencies = [
"cfgrammar",
"chrono",
@@ -9632,6 +9636,16 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "roaring"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41589aba99537475bf697f2118357cad1c31590c5a1b9f6d9fc4ad6d07503661"
dependencies = [
"bytemuck",
"byteorder",
]
[[package]]
name = "robust"
version = "1.1.0"
@@ -11051,7 +11065,7 @@ dependencies = [
"serde_json",
"sha2",
"smallvec",
"thiserror 2.0.6",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
@@ -11136,7 +11150,7 @@ dependencies = [
"smallvec",
"sqlx-core",
"stringprep",
"thiserror 2.0.6",
"thiserror 2.0.12",
"tracing",
"whoami",
]
@@ -11174,7 +11188,7 @@ dependencies = [
"smallvec",
"sqlx-core",
"stringprep",
"thiserror 2.0.6",
"thiserror 2.0.12",
"tracing",
"whoami",
]
@@ -11955,11 +11969,11 @@ dependencies = [
[[package]]
name = "thiserror"
version = "2.0.6"
version = "2.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47"
checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
dependencies = [
"thiserror-impl 2.0.6",
"thiserror-impl 2.0.12",
]
[[package]]
@@ -11975,9 +11989,9 @@ dependencies = [
[[package]]
name = "thiserror-impl"
version = "2.0.6"
version = "2.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312"
checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
dependencies = [
"proc-macro2",
"quote",

View File

@@ -129,7 +129,7 @@ etcd-client = "0.14"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "072ce580502e015df1a6b03a185b60309a7c2a7a" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "d92c9ac4e90ef4abdcf5c2eaf5a164e18ba09486" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -160,9 +160,7 @@ parquet = { version = "53.0.0", default-features = false, features = ["arrow", "
paste = "1.0"
pin-project = "1.0"
prometheus = { version = "0.13.3", features = ["process"] }
promql-parser = { git = "https://github.com/GreptimeTeam/promql-parser.git", features = [
"ser",
], rev = "27abb8e16003a50c720f00d6c85f41f5fa2a2a8e" }
promql-parser = { version = "0.5", features = ["ser"] }
prost = "0.13"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.8"

View File

@@ -8,7 +8,7 @@ CARGO_BUILD_OPTS := --locked
IMAGE_REGISTRY ?= docker.io
IMAGE_NAMESPACE ?= greptime
IMAGE_TAG ?= latest
DEV_BUILDER_IMAGE_TAG ?= 2024-12-25-9d0fa5d5-20250124085746
DEV_BUILDER_IMAGE_TAG ?= 2024-12-25-a71b93dd-20250305072908
BUILDX_MULTI_PLATFORM_BUILD ?= false
BUILDX_BUILDER_NAME ?= gtbuilder
BASE_IMAGE ?= ubuntu
@@ -60,6 +60,8 @@ ifeq ($(BUILDX_MULTI_PLATFORM_BUILD), all)
BUILDX_MULTI_PLATFORM_BUILD_OPTS := --platform linux/amd64,linux/arm64 --push
else ifeq ($(BUILDX_MULTI_PLATFORM_BUILD), amd64)
BUILDX_MULTI_PLATFORM_BUILD_OPTS := --platform linux/amd64 --push
else ifeq ($(BUILDX_MULTI_PLATFORM_BUILD), arm64)
BUILDX_MULTI_PLATFORM_BUILD_OPTS := --platform linux/arm64 --push
else
BUILDX_MULTI_PLATFORM_BUILD_OPTS := -o type=docker
endif

grafana/check.sh (new executable file, 19 lines)

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
# Use jq to check for panels with empty or missing descriptions
invalid_panels=$(cat $BASEDIR/greptimedb-cluster.json | jq -r '
.panels[]
| select((.type == "stats" or .type == "timeseries") and (.description == "" or .description == null))
')
# Check if any invalid panels were found
if [[ -n "$invalid_panels" ]]; then
echo "Error: The following panels have empty or missing descriptions:"
echo "$invalid_panels"
exit 1
else
echo "All panels with type 'stats' or 'timeseries' have valid descriptions."
exit 0
fi
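
For readers less familiar with jq, the following is a rough Rust equivalent of the panel check above — a sketch only, assuming `serde_json` is available and that the dashboard JSON sits at `grafana/greptimedb-cluster.json` relative to the working directory; CI runs the shell script, not this code:

use std::fs;
use std::process::exit;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load the dashboard JSON (path assumed for illustration).
    let raw = fs::read_to_string("grafana/greptimedb-cluster.json")?;
    let dashboard: serde_json::Value = serde_json::from_str(&raw)?;

    // Mirror the jq filter: panels of type "stats" or "timeseries"
    // with an empty or missing description are invalid.
    let empty = Vec::new();
    let panels = dashboard["panels"].as_array().unwrap_or(&empty);
    let invalid: Vec<&serde_json::Value> = panels
        .iter()
        .filter(|p| {
            let ty = p["type"].as_str().unwrap_or("");
            let desc = p["description"].as_str().unwrap_or("");
            (ty == "stats" || ty == "timeseries") && desc.is_empty()
        })
        .collect();

    if !invalid.is_empty() {
        eprintln!(
            "Error: {} panels have empty or missing descriptions",
            invalid.len()
        );
        exit(1);
    }
    println!("All panels with type 'stats' or 'timeseries' have valid descriptions.");
    Ok(())
}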

File diff suppressed because it is too large

grafana/summary.sh (new executable file, 11 lines)

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
echo '| Title | Description | Expressions |
|---|---|---|'
cat $BASEDIR/greptimedb-cluster.json | jq -r '
.panels |
map(select(.type == "stat" or .type == "timeseries")) |
.[] | "| \(.title) | \(.description | gsub("\n"; "<br>")) | \(.targets | map(.expr // .rawSql | "`\(.|gsub("\n"; "<br>"))`") | join("<br>")) |"
'

View File

@@ -16,7 +16,6 @@
mod client;
pub mod client_manager;
#[cfg(feature = "testing")]
mod database;
pub mod error;
pub mod flow;
@@ -34,7 +33,6 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;
pub use self::client::Client;
#[cfg(feature = "testing")]
pub use self::database::Database;
pub use self::error::{Error, Result};
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};

View File

@@ -287,7 +287,6 @@ impl StartCommand {
.await
.context(StartDatanodeSnafu)?;
let cluster_id = 0; // TODO(hl): read from config
let member_id = opts
.node_id
.context(MissingConfigSnafu { msg: "'node_id'" })?;
@@ -296,13 +295,10 @@ impl StartCommand {
msg: "'meta_client_options'",
})?;
let meta_client = meta_client::create_meta_client(
cluster_id,
MetaClientType::Datanode { member_id },
meta_config,
)
.await
.context(MetaClientInitSnafu)?;
let meta_client =
meta_client::create_meta_client(MetaClientType::Datanode { member_id }, meta_config)
.await
.context(MetaClientInitSnafu)?;
let meta_backend = Arc::new(MetaKvBackend {
client: meta_client.clone(),

View File

@@ -32,7 +32,7 @@ use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::TracingOptions;
use common_version::{short_version, version};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendClient, FrontendInvoker};
use meta_client::{MetaClientOptions, MetaClientType};
use servers::Mode;
use snafu::{OptionExt, ResultExt};
@@ -241,9 +241,6 @@ impl StartCommand {
let mut opts = opts.component;
opts.grpc.detect_server_addr();
// TODO(discord9): make it not optionale after cluster id is required
let cluster_id = opts.cluster_id.unwrap_or(0);
let member_id = opts
.node_id
.context(MissingConfigSnafu { msg: "'node_id'" })?;
@@ -252,13 +249,10 @@ impl StartCommand {
msg: "'meta_client_options'",
})?;
let meta_client = meta_client::create_meta_client(
cluster_id,
MetaClientType::Flownode { member_id },
meta_config,
)
.await
.context(MetaClientInitSnafu)?;
let meta_client =
meta_client::create_meta_client(MetaClientType::Flownode { member_id }, meta_config)
.await
.context(MetaClientInitSnafu)?;
let cache_max_capacity = meta_config.metadata_cache_max_capacity;
let cache_ttl = meta_config.metadata_cache_ttl;
@@ -317,6 +311,8 @@ impl StartCommand {
Arc::new(executor),
);
let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
let flownode_builder = FlownodeBuilder::new(
opts,
@@ -324,6 +320,7 @@ impl StartCommand {
table_metadata_manager,
catalog_manager.clone(),
flow_metadata_manager,
Arc::new(frontend_client),
)
.with_heartbeat_task(heartbeat_task);

View File

@@ -295,14 +295,10 @@ impl StartCommand {
let cache_ttl = meta_client_options.metadata_cache_ttl;
let cache_tti = meta_client_options.metadata_cache_tti;
let cluster_id = 0; // (TODO: jeremy): It is currently a reserved field and has not been enabled.
let meta_client = meta_client::create_meta_client(
cluster_id,
MetaClientType::Frontend,
meta_client_options,
)
.await
.context(MetaClientInitSnafu)?;
let meta_client =
meta_client::create_meta_client(MetaClientType::Frontend, meta_client_options)
.await
.context(MetaClientInitSnafu)?;
// TODO(discord9): add helper function to ease the creation of cache registry&such
let cached_meta_backend =

View File

@@ -54,7 +54,10 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
use datanode::datanode::{Datanode, DatanodeBuilder};
use datanode::region_server::RegionServer;
use file_engine::config::EngineConfig as FileEngineConfig;
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
use flow::{
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendClient,
FrontendInvoker,
};
use frontend::frontend::FrontendOptions;
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
@@ -533,12 +536,16 @@ impl StartCommand {
flow: opts.flow.clone(),
..Default::default()
};
let fe_server_addr = fe_opts.grpc.bind_addr.clone();
let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
flownode_options,
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
Arc::new(frontend_client),
);
let flownode = Arc::new(
flow_builder

View File

@@ -445,10 +445,20 @@ impl Pool {
async fn recycle_channel_in_loop(pool: Arc<Pool>, interval_secs: u64) {
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
// use weak ref here to prevent pool being leaked
let pool_weak = {
let weak = Arc::downgrade(&pool);
drop(pool);
weak
};
loop {
let _ = interval.tick().await;
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
if let Some(pool) = pool_weak.upgrade() {
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
} else {
// no one is using this pool, so we can also let go
break;
}
}
}
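
The leak fix above comes down to the recycling loop holding only a Weak reference, so the background task no longer keeps the pool alive on its own. A minimal, std-only sketch of the same pattern, with a hypothetical Pool type and a plain thread standing in for the tokio interval used in the real code:

use std::sync::{Arc, Weak};
use std::thread;
use std::time::Duration;

struct Pool; // stand-in for the real channel pool

impl Pool {
    fn recycle_idle(&self) {
        // drop channels that were not accessed since the last tick
    }
}

fn spawn_recycler(pool: &Arc<Pool>, every: Duration) {
    // Hold only a Weak reference so this loop does not keep the pool alive.
    let weak: Weak<Pool> = Arc::downgrade(pool);
    thread::spawn(move || loop {
        thread::sleep(every);
        match weak.upgrade() {
            // Someone still owns the pool: do one round of recycling.
            Some(pool) => pool.recycle_idle(),
            // Last strong reference is gone: exit instead of leaking.
            None => break,
        }
    });
}

fn main() {
    let pool = Arc::new(Pool);
    spawn_recycler(&pool, Duration::from_secs(60));
    drop(pool); // the recycler exits on its next tick (or when the process ends)
}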

View File

@@ -28,7 +28,6 @@ use crate::error::{
InvalidRoleSnafu, ParseNumSnafu, Result,
};
use crate::peer::Peer;
use crate::ClusterId;
const CLUSTER_NODE_INFO_PREFIX: &str = "__meta_cluster_node_info";
@@ -56,12 +55,9 @@ pub trait ClusterInfo {
// TODO(jeremy): Other info, like region status, etc.
}
/// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-{cluster_id}-{role}-{node_id}`.
/// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-0-{role}-{node_id}`.
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct NodeInfoKey {
/// The cluster id.
// todo(hl): remove cluster_id as it is not assigned anywhere.
pub cluster_id: ClusterId,
/// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
pub role: Role,
/// The node id.
@@ -84,24 +80,15 @@ impl NodeInfoKey {
_ => peer.id,
};
Some(NodeInfoKey {
cluster_id: header.cluster_id,
role,
node_id,
})
Some(NodeInfoKey { role, node_id })
}
pub fn key_prefix_with_cluster_id(cluster_id: u64) -> String {
format!("{}-{}-", CLUSTER_NODE_INFO_PREFIX, cluster_id)
pub fn key_prefix() -> String {
format!("{}-0-", CLUSTER_NODE_INFO_PREFIX)
}
pub fn key_prefix_with_role(cluster_id: ClusterId, role: Role) -> String {
format!(
"{}-{}-{}-",
CLUSTER_NODE_INFO_PREFIX,
cluster_id,
i32::from(role)
)
pub fn key_prefix_with_role(role: Role) -> String {
format!("{}-0-{}-", CLUSTER_NODE_INFO_PREFIX, i32::from(role))
}
}
@@ -193,15 +180,10 @@ impl FromStr for NodeInfoKey {
let caps = CLUSTER_NODE_INFO_PREFIX_PATTERN
.captures(key)
.context(InvalidNodeInfoKeySnafu { key })?;
ensure!(caps.len() == 4, InvalidNodeInfoKeySnafu { key });
let cluster_id = caps[1].to_string();
let role = caps[2].to_string();
let node_id = caps[3].to_string();
let cluster_id: u64 = cluster_id.parse().context(ParseNumSnafu {
err_msg: format!("invalid cluster_id: {cluster_id}"),
})?;
let role: i32 = role.parse().context(ParseNumSnafu {
err_msg: format!("invalid role {role}"),
})?;
@@ -210,11 +192,7 @@ impl FromStr for NodeInfoKey {
err_msg: format!("invalid node_id: {node_id}"),
})?;
Ok(Self {
cluster_id,
role,
node_id,
})
Ok(Self { role, node_id })
}
}
@@ -233,9 +211,8 @@ impl TryFrom<Vec<u8>> for NodeInfoKey {
impl From<&NodeInfoKey> for Vec<u8> {
fn from(key: &NodeInfoKey) -> Self {
format!(
"{}-{}-{}-{}",
"{}-0-{}-{}",
CLUSTER_NODE_INFO_PREFIX,
key.cluster_id,
i32::from(key.role),
key.node_id
)
@@ -308,7 +285,6 @@ mod tests {
#[test]
fn test_node_info_key_round_trip() {
let key = NodeInfoKey {
cluster_id: 1,
role: Datanode,
node_id: 2,
};
@@ -316,7 +292,6 @@ mod tests {
let key_bytes: Vec<u8> = (&key).into();
let new_key: NodeInfoKey = key_bytes.try_into().unwrap();
assert_eq!(1, new_key.cluster_id);
assert_eq!(Datanode, new_key.role);
assert_eq!(2, new_key.node_id);
}
@@ -362,11 +337,11 @@ mod tests {
#[test]
fn test_node_info_key_prefix() {
let prefix = NodeInfoKey::key_prefix_with_cluster_id(1);
assert_eq!(prefix, "__meta_cluster_node_info-1-");
let prefix = NodeInfoKey::key_prefix();
assert_eq!(prefix, "__meta_cluster_node_info-0-");
let prefix = NodeInfoKey::key_prefix_with_role(2, Frontend);
assert_eq!(prefix, "__meta_cluster_node_info-2-1-");
let prefix = NodeInfoKey::key_prefix_with_role(Frontend);
assert_eq!(prefix, "__meta_cluster_node_info-0-1-");
}
#[test]

View File

@@ -25,8 +25,8 @@ use store_api::region_engine::{RegionRole, RegionStatistic};
use store_api::storage::RegionId;
use table::metadata::TableId;
use crate::error;
use crate::error::Result;
use crate::{error, ClusterId};
pub(crate) const DATANODE_LEASE_PREFIX: &str = "__meta_datanode_lease";
const INACTIVE_REGION_PREFIX: &str = "__meta_inactive_region";
@@ -48,11 +48,10 @@ lazy_static! {
/// The key of the datanode stat in the storage.
///
/// The format is `__meta_datanode_stat-{cluster_id}-{node_id}`.
/// The format is `__meta_datanode_stat-0-{node_id}`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Stat {
pub timestamp_millis: i64,
pub cluster_id: ClusterId,
// The datanode Id.
pub id: u64,
// The datanode address.
@@ -102,10 +101,7 @@ impl Stat {
}
pub fn stat_key(&self) -> DatanodeStatKey {
DatanodeStatKey {
cluster_id: self.cluster_id,
node_id: self.id,
}
DatanodeStatKey { node_id: self.id }
}
/// Returns a tuple array containing [RegionId] and [RegionRole].
@@ -145,7 +141,7 @@ impl TryFrom<&HeartbeatRequest> for Stat {
} = value;
match (header, peer) {
(Some(header), Some(peer)) => {
(Some(_header), Some(peer)) => {
let region_stats = region_stats
.iter()
.map(RegionStat::from)
@@ -153,7 +149,6 @@ impl TryFrom<&HeartbeatRequest> for Stat {
Ok(Self {
timestamp_millis: time_util::current_time_millis(),
cluster_id: header.cluster_id,
// datanode id
id: peer.id,
// datanode address
@@ -196,32 +191,24 @@ impl From<&api::v1::meta::RegionStat> for RegionStat {
/// The key of the datanode stat in the memory store.
///
/// The format is `__meta_datanode_stat-{cluster_id}-{node_id}`.
/// The format is `__meta_datanode_stat-0-{node_id}`.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct DatanodeStatKey {
pub cluster_id: ClusterId,
pub node_id: u64,
}
impl DatanodeStatKey {
/// The key prefix.
pub fn prefix_key() -> Vec<u8> {
format!("{DATANODE_STAT_PREFIX}-").into_bytes()
}
/// The key prefix with the cluster id.
pub fn key_prefix_with_cluster_id(cluster_id: ClusterId) -> String {
format!("{DATANODE_STAT_PREFIX}-{cluster_id}-")
// todo(hl): remove cluster id in prefix
format!("{DATANODE_STAT_PREFIX}-0-").into_bytes()
}
}
impl From<DatanodeStatKey> for Vec<u8> {
fn from(value: DatanodeStatKey) -> Self {
format!(
"{}-{}-{}",
DATANODE_STAT_PREFIX, value.cluster_id, value.node_id
)
.into_bytes()
// todo(hl): remove cluster id in prefix
format!("{}-0-{}", DATANODE_STAT_PREFIX, value.node_id).into_bytes()
}
}
@@ -234,20 +221,12 @@ impl FromStr for DatanodeStatKey {
.context(error::InvalidStatKeySnafu { key })?;
ensure!(caps.len() == 3, error::InvalidStatKeySnafu { key });
let cluster_id = caps[1].to_string();
let node_id = caps[2].to_string();
let cluster_id: u64 = cluster_id.parse().context(error::ParseNumSnafu {
err_msg: format!("invalid cluster_id: {cluster_id}"),
})?;
let node_id: u64 = node_id.parse().context(error::ParseNumSnafu {
err_msg: format!("invalid node_id: {node_id}"),
})?;
Ok(Self {
cluster_id,
node_id,
})
Ok(Self { node_id })
}
}
@@ -321,7 +300,6 @@ mod tests {
#[test]
fn test_stat_key() {
let stat = Stat {
cluster_id: 3,
id: 101,
region_num: 10,
..Default::default()
@@ -329,14 +307,12 @@ mod tests {
let stat_key = stat.stat_key();
assert_eq!(3, stat_key.cluster_id);
assert_eq!(101, stat_key.node_id);
}
#[test]
fn test_stat_val_round_trip() {
let stat = Stat {
cluster_id: 0,
id: 101,
region_num: 100,
..Default::default()
@@ -351,7 +327,6 @@ mod tests {
assert_eq!(1, stats.len());
let stat = stats.first().unwrap();
assert_eq!(0, stat.cluster_id);
assert_eq!(101, stat.id);
assert_eq!(100, stat.region_num);
}

View File

@@ -30,7 +30,7 @@ use crate::node_manager::NodeManagerRef;
use crate::region_keeper::MemoryRegionKeeperRef;
use crate::rpc::ddl::{SubmitDdlTaskRequest, SubmitDdlTaskResponse};
use crate::rpc::procedure::{MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse};
use crate::{ClusterId, DatanodeId};
use crate::DatanodeId;
pub mod alter_database;
pub mod alter_logical_tables;
@@ -57,7 +57,6 @@ pub mod utils;
#[derive(Debug, Default)]
pub struct ExecutorContext {
pub cluster_id: Option<u64>,
pub tracing_context: Option<W3cTrace>,
}
@@ -90,10 +89,6 @@ pub trait ProcedureExecutor: Send + Sync {
pub type ProcedureExecutorRef = Arc<dyn ProcedureExecutor>;
pub struct TableMetadataAllocatorContext {
pub cluster_id: ClusterId,
}
/// Metadata allocated to a table.
#[derive(Default)]
pub struct TableMetadata {
@@ -108,7 +103,7 @@ pub struct TableMetadata {
pub type RegionFailureDetectorControllerRef = Arc<dyn RegionFailureDetectorController>;
pub type DetectingRegion = (ClusterId, DatanodeId, RegionId);
pub type DetectingRegion = (DatanodeId, RegionId);
/// Used for actively registering Region failure detectors.
///

View File

@@ -30,7 +30,6 @@ use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock};
use crate::rpc::ddl::UnsetDatabaseOption::{self};
use crate::rpc::ddl::{AlterDatabaseKind, AlterDatabaseTask, SetDatabaseOption};
use crate::ClusterId;
pub struct AlterDatabaseProcedure {
pub context: DdlContext,
@@ -65,14 +64,10 @@ fn build_new_schema_value(
impl AlterDatabaseProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::AlterDatabase";
pub fn new(
cluster_id: ClusterId,
task: AlterDatabaseTask,
context: DdlContext,
) -> Result<Self> {
pub fn new(task: AlterDatabaseTask, context: DdlContext) -> Result<Self> {
Ok(Self {
context,
data: AlterDatabaseData::new(task, cluster_id)?,
data: AlterDatabaseData::new(task)?,
})
}
@@ -183,7 +178,6 @@ enum AlterDatabaseState {
/// The data of alter database procedure.
#[derive(Debug, Serialize, Deserialize)]
pub struct AlterDatabaseData {
cluster_id: ClusterId,
state: AlterDatabaseState,
kind: AlterDatabaseKind,
catalog_name: String,
@@ -192,9 +186,8 @@ pub struct AlterDatabaseData {
}
impl AlterDatabaseData {
pub fn new(task: AlterDatabaseTask, cluster_id: ClusterId) -> Result<Self> {
pub fn new(task: AlterDatabaseTask) -> Result<Self> {
Ok(Self {
cluster_id,
state: AlterDatabaseState::Prepare,
kind: AlterDatabaseKind::try_from(task.alter_expr.kind.unwrap())?,
catalog_name: task.alter_expr.catalog_name,

View File

@@ -37,9 +37,9 @@ use crate::key::table_info::TableInfoValue;
use crate::key::table_route::PhysicalTableRouteValue;
use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::find_leaders;
use crate::{metrics, ClusterId};
pub struct AlterLogicalTablesProcedure {
pub context: DdlContext,
@@ -50,7 +50,6 @@ impl AlterLogicalTablesProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::AlterLogicalTables";
pub fn new(
cluster_id: ClusterId,
tasks: Vec<AlterTableTask>,
physical_table_id: TableId,
context: DdlContext,
@@ -58,7 +57,6 @@ impl AlterLogicalTablesProcedure {
Self {
context,
data: AlterTablesData {
cluster_id,
state: AlterTablesState::Prepare,
tasks,
table_info_values: vec![],
@@ -240,7 +238,6 @@ impl Procedure for AlterLogicalTablesProcedure {
#[derive(Debug, Serialize, Deserialize)]
pub struct AlterTablesData {
cluster_id: ClusterId,
state: AlterTablesState,
tasks: Vec<AlterTableTask>,
/// Table info values before the alter operation.

View File

@@ -45,9 +45,9 @@ use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::{DeserializedValueWithBytes, RegionDistribution};
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::metrics;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders, region_distribution};
use crate::{metrics, ClusterId};
/// The alter table procedure
pub struct AlterTableProcedure {
@@ -64,16 +64,11 @@ pub struct AlterTableProcedure {
impl AlterTableProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::AlterTable";
pub fn new(
cluster_id: ClusterId,
table_id: TableId,
task: AlterTableTask,
context: DdlContext,
) -> Result<Self> {
pub fn new(table_id: TableId, task: AlterTableTask, context: DdlContext) -> Result<Self> {
task.validate()?;
Ok(Self {
context,
data: AlterTableData::new(task, table_id, cluster_id),
data: AlterTableData::new(task, table_id),
new_table_info: None,
})
}
@@ -307,7 +302,6 @@ enum AlterTableState {
// The serialized data of alter table.
#[derive(Debug, Serialize, Deserialize)]
pub struct AlterTableData {
cluster_id: ClusterId,
state: AlterTableState,
task: AlterTableTask,
table_id: TableId,
@@ -318,12 +312,11 @@ pub struct AlterTableData {
}
impl AlterTableData {
pub fn new(task: AlterTableTask, table_id: TableId, cluster_id: u64) -> Self {
pub fn new(task: AlterTableTask, table_id: TableId) -> Self {
Self {
state: AlterTableState::Prepare,
task,
table_id,
cluster_id,
table_info_value: None,
region_distribution: None,
}

View File

@@ -167,10 +167,9 @@ mod tests {
use crate::test_util::{new_ddl_context, MockDatanodeManager};
/// Prepares a region with schema `[ts: Timestamp, host: Tag, cpu: Field]`.
async fn prepare_ddl_context() -> (DdlContext, u64, TableId, RegionId, String) {
async fn prepare_ddl_context() -> (DdlContext, TableId, RegionId, String) {
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(datanode_manager);
let cluster_id = 1;
let table_id = 1024;
let region_id = RegionId::new(table_id, 1);
let table_name = "foo";
@@ -225,19 +224,12 @@ mod tests {
)
.await
.unwrap();
(
ddl_context,
cluster_id,
table_id,
region_id,
table_name.to_string(),
)
(ddl_context, table_id, region_id, table_name.to_string())
}
#[tokio::test]
async fn test_make_alter_region_request() {
let (ddl_context, cluster_id, table_id, region_id, table_name) =
prepare_ddl_context().await;
let (ddl_context, table_id, region_id, table_name) = prepare_ddl_context().await;
let task = AlterTableTask {
alter_table: AlterTableExpr {
@@ -265,8 +257,7 @@ mod tests {
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context).unwrap();
let mut procedure = AlterTableProcedure::new(table_id, task, ddl_context).unwrap();
procedure.on_prepare().await.unwrap();
let alter_kind = procedure.make_region_alter_kind().unwrap();
let Some(Body::Alter(alter_region_request)) = procedure
@@ -307,8 +298,7 @@ mod tests {
#[tokio::test]
async fn test_make_alter_column_type_region_request() {
let (ddl_context, cluster_id, table_id, region_id, table_name) =
prepare_ddl_context().await;
let (ddl_context, table_id, region_id, table_name) = prepare_ddl_context().await;
let task = AlterTableTask {
alter_table: AlterTableExpr {
@@ -325,8 +315,7 @@ mod tests {
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context).unwrap();
let mut procedure = AlterTableProcedure::new(table_id, task, ddl_context).unwrap();
procedure.on_prepare().await.unwrap();
let alter_kind = procedure.make_region_alter_kind().unwrap();
let Some(Body::Alter(alter_region_request)) = procedure

View File

@@ -46,9 +46,9 @@ use crate::key::flow::flow_route::FlowRouteValue;
use crate::key::table_name::TableNameKey;
use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId};
use crate::lock_key::{CatalogLock, FlowNameLock, TableNameLock};
use crate::metrics;
use crate::peer::Peer;
use crate::rpc::ddl::{CreateFlowTask, QueryContext};
use crate::{metrics, ClusterId};
/// The procedure of flow creation.
pub struct CreateFlowProcedure {
@@ -60,16 +60,10 @@ impl CreateFlowProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::CreateFlow";
/// Returns a new [CreateFlowProcedure].
pub fn new(
cluster_id: ClusterId,
task: CreateFlowTask,
query_context: QueryContext,
context: DdlContext,
) -> Self {
pub fn new(task: CreateFlowTask, query_context: QueryContext, context: DdlContext) -> Self {
Self {
context,
data: CreateFlowData {
cluster_id,
task,
flow_id: None,
peers: vec![],
@@ -343,6 +337,7 @@ pub enum FlowType {
impl FlowType {
pub const RECORDING_RULE: &str = "recording_rule";
pub const STREAMING: &str = "streaming";
pub const FLOW_TYPE_KEY: &str = "flow_type";
}
impl Default for FlowType {
@@ -363,7 +358,6 @@ impl fmt::Display for FlowType {
/// The serializable data.
#[derive(Debug, Serialize, Deserialize)]
pub struct CreateFlowData {
pub(crate) cluster_id: ClusterId,
pub(crate) state: CreateFlowState,
pub(crate) task: CreateFlowTask,
pub(crate) flow_id: Option<FlowId>,
@@ -398,7 +392,8 @@ impl From<&CreateFlowData> for CreateRequest {
};
let flow_type = value.flow_type.unwrap_or_default().to_string();
req.flow_options.insert("flow_type".to_string(), flow_type);
req.flow_options
.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
req
}
}
@@ -430,7 +425,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
.collect::<Vec<_>>();
let flow_type = value.flow_type.unwrap_or_default().to_string();
options.insert("flow_type".to_string(), flow_type);
options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
let flow_info = FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),

View File

@@ -23,11 +23,10 @@ impl CreateFlowProcedure {
pub(crate) async fn allocate_flow_id(&mut self) -> Result<()> {
//TODO(weny, ruihang): We doesn't support the partitions. It's always be 1, now.
let partitions = 1;
let cluster_id = self.data.cluster_id;
let (flow_id, peers) = self
.context
.flow_metadata_allocator
.create(cluster_id, partitions)
.create(partitions)
.await?;
self.data.flow_id = Some(flow_id);
self.data.peers = peers;

View File

@@ -36,9 +36,9 @@ use crate::ddl::DdlContext;
use crate::error::{DecodeJsonSnafu, MetadataCorruptionSnafu, Result};
use crate::key::table_route::TableRouteValue;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::metrics;
use crate::rpc::ddl::CreateTableTask;
use crate::rpc::router::{find_leaders, RegionRoute};
use crate::{metrics, ClusterId};
pub struct CreateLogicalTablesProcedure {
pub context: DdlContext,
@@ -49,7 +49,6 @@ impl CreateLogicalTablesProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::CreateLogicalTables";
pub fn new(
cluster_id: ClusterId,
tasks: Vec<CreateTableTask>,
physical_table_id: TableId,
context: DdlContext,
@@ -57,7 +56,6 @@ impl CreateLogicalTablesProcedure {
Self {
context,
data: CreateTablesData {
cluster_id,
state: CreateTablesState::Prepare,
tasks,
table_ids_already_exists: vec![],
@@ -245,7 +243,6 @@ impl Procedure for CreateLogicalTablesProcedure {
#[derive(Debug, Serialize, Deserialize)]
pub struct CreateTablesData {
cluster_id: ClusterId,
state: CreateTablesState,
tasks: Vec<CreateTableTask>,
table_ids_already_exists: Vec<Option<TableId>>,

View File

@@ -37,17 +37,17 @@ use crate::ddl::utils::{
add_peer_context_if_needed, convert_region_routes_to_detecting_regions, handle_retry_error,
region_storage_path,
};
use crate::ddl::{DdlContext, TableMetadata, TableMetadataAllocatorContext};
use crate::ddl::{DdlContext, TableMetadata};
use crate::error::{self, Result};
use crate::key::table_name::TableNameKey;
use crate::key::table_route::{PhysicalTableRouteValue, TableRouteValue};
use crate::lock_key::{CatalogLock, SchemaLock, TableNameLock};
use crate::metrics;
use crate::region_keeper::OperatingRegionGuard;
use crate::rpc::ddl::CreateTableTask;
use crate::rpc::router::{
find_leader_regions, find_leaders, operating_leader_regions, RegionRoute,
};
use crate::{metrics, ClusterId};
pub struct CreateTableProcedure {
pub context: DdlContext,
pub creator: TableCreator,
@@ -56,10 +56,10 @@ pub struct CreateTableProcedure {
impl CreateTableProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::CreateTable";
pub fn new(cluster_id: ClusterId, task: CreateTableTask, context: DdlContext) -> Self {
pub fn new(task: CreateTableTask, context: DdlContext) -> Self {
Self {
context,
creator: TableCreator::new(cluster_id, task),
creator: TableCreator::new(task),
}
}
@@ -154,12 +154,7 @@ impl CreateTableProcedure {
} = self
.context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext {
cluster_id: self.creator.data.cluster_id,
},
&self.creator.data.task,
)
.create(&self.creator.data.task)
.await?;
self.creator
.set_allocated_metadata(table_id, table_route, region_wal_options);
@@ -268,7 +263,6 @@ impl CreateTableProcedure {
/// - Failed to create table metadata.
async fn on_create_metadata(&mut self) -> Result<Status> {
let table_id = self.table_id();
let cluster_id = self.creator.data.cluster_id;
let manager = &self.context.table_metadata_manager;
let raw_table_info = self.table_info().clone();
@@ -276,10 +270,8 @@ impl CreateTableProcedure {
let region_wal_options = self.region_wal_options()?.clone();
// Safety: the table_route must be allocated.
let physical_table_route = self.table_route()?.clone();
let detecting_regions = convert_region_routes_to_detecting_regions(
cluster_id,
&physical_table_route.region_routes,
);
let detecting_regions =
convert_region_routes_to_detecting_regions(&physical_table_route.region_routes);
let table_route = TableRouteValue::Physical(physical_table_route);
manager
.create_table_metadata(raw_table_info, table_route, region_wal_options)
@@ -351,11 +343,10 @@ pub struct TableCreator {
}
impl TableCreator {
pub fn new(cluster_id: ClusterId, task: CreateTableTask) -> Self {
pub fn new(task: CreateTableTask) -> Self {
Self {
data: CreateTableData {
state: CreateTableState::Prepare,
cluster_id,
task,
table_route: None,
region_wal_options: None,
@@ -421,7 +412,6 @@ pub struct CreateTableData {
table_route: Option<PhysicalTableRouteValue>,
/// None stands for not allocated yet.
pub region_wal_options: Option<HashMap<RegionNumber, String>>,
pub cluster_id: ClusterId,
}
impl CreateTableData {

View File

@@ -24,13 +24,13 @@ use table::table_reference::TableReference;
use crate::cache_invalidator::Context;
use crate::ddl::utils::handle_retry_error;
use crate::ddl::{DdlContext, TableMetadata, TableMetadataAllocatorContext};
use crate::ddl::{DdlContext, TableMetadata};
use crate::error::{self, Result};
use crate::instruction::CacheIdent;
use crate::key::table_name::TableNameKey;
use crate::lock_key::{CatalogLock, SchemaLock, TableNameLock};
use crate::metrics;
use crate::rpc::ddl::CreateViewTask;
use crate::{metrics, ClusterId};
// The procedure to execute `[CreateViewTask]`.
pub struct CreateViewProcedure {
@@ -41,12 +41,11 @@ pub struct CreateViewProcedure {
impl CreateViewProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::CreateView";
pub fn new(cluster_id: ClusterId, task: CreateViewTask, context: DdlContext) -> Self {
pub fn new(task: CreateViewTask, context: DdlContext) -> Self {
Self {
context,
data: CreateViewData {
state: CreateViewState::Prepare,
cluster_id,
task,
need_update: false,
},
@@ -144,12 +143,7 @@ impl CreateViewProcedure {
let TableMetadata { table_id, .. } = self
.context
.table_metadata_allocator
.create_view(
&TableMetadataAllocatorContext {
cluster_id: self.data.cluster_id,
},
&None,
)
.create_view(&None)
.await?;
self.data.set_allocated_metadata(table_id, false);
}
@@ -285,7 +279,6 @@ pub enum CreateViewState {
pub struct CreateViewData {
pub state: CreateViewState,
pub task: CreateViewTask,
pub cluster_id: ClusterId,
/// Whether to update the view info.
pub need_update: bool,
}

View File

@@ -35,7 +35,6 @@ use crate::ddl::DdlContext;
use crate::error::Result;
use crate::key::table_name::TableNameValue;
use crate::lock_key::{CatalogLock, SchemaLock};
use crate::ClusterId;
pub struct DropDatabaseProcedure {
/// The context of procedure runtime.
@@ -54,7 +53,6 @@ pub(crate) enum DropTableTarget {
/// Context of [DropDatabaseProcedure] execution.
pub(crate) struct DropDatabaseContext {
cluster_id: ClusterId,
catalog: String,
schema: String,
drop_if_exists: bool,
@@ -87,7 +85,6 @@ impl DropDatabaseProcedure {
Self {
runtime_context: context,
context: DropDatabaseContext {
cluster_id: 0,
catalog,
schema,
drop_if_exists,
@@ -108,7 +105,6 @@ impl DropDatabaseProcedure {
Ok(Self {
runtime_context,
context: DropDatabaseContext {
cluster_id: 0,
catalog,
schema,
drop_if_exists,

View File

@@ -217,11 +217,10 @@ mod tests {
async fn test_next_without_logical_tables() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
create_physical_table(&ddl_context, 0, "phy").await;
create_physical_table(&ddl_context, "phy").await;
// It always starts from Logical
let mut state = DropDatabaseCursor::new(DropTableTarget::Logical);
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,
@@ -252,12 +251,11 @@ mod tests {
async fn test_next_with_logical_tables() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await;
create_logical_table(ddl_context.clone(), 0, physical_table_id, "metric_0").await;
let physical_table_id = create_physical_table(&ddl_context, "phy").await;
create_logical_table(ddl_context.clone(), physical_table_id, "metric_0").await;
// It always starts from Logical
let mut state = DropDatabaseCursor::new(DropTableTarget::Logical);
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,
@@ -286,7 +284,6 @@ mod tests {
let ddl_context = new_ddl_context(node_manager);
let mut state = DropDatabaseCursor::new(DropTableTarget::Physical);
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,

View File

@@ -98,11 +98,10 @@ impl State for DropDatabaseExecutor {
async fn next(
&mut self,
ddl_ctx: &DdlContext,
ctx: &mut DropDatabaseContext,
_ctx: &mut DropDatabaseContext,
) -> Result<(Box<dyn State>, Status)> {
self.register_dropping_regions(ddl_ctx)?;
let executor =
DropTableExecutor::new(ctx.cluster_id, self.table_name.clone(), self.table_id, true);
let executor = DropTableExecutor::new(self.table_name.clone(), self.table_id, true);
// Deletes metadata for table permanently.
let table_route_value = TableRouteValue::new(
self.table_id,
@@ -187,7 +186,7 @@ mod tests {
async fn test_next_with_physical_table() {
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await;
let physical_table_id = create_physical_table(&ddl_context, "phy").await;
let (_, table_route) = ddl_context
.table_metadata_manager
.table_route_manager()
@@ -203,7 +202,6 @@ mod tests {
DropTableTarget::Physical,
);
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,
@@ -216,7 +214,6 @@ mod tests {
}
// Execute again
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,
@@ -239,8 +236,8 @@ mod tests {
async fn test_next_logical_table() {
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await;
create_logical_table(ddl_context.clone(), 0, physical_table_id, "metric").await;
let physical_table_id = create_physical_table(&ddl_context, "phy").await;
create_logical_table(ddl_context.clone(), physical_table_id, "metric").await;
let logical_table_id = physical_table_id + 1;
let (_, table_route) = ddl_context
.table_metadata_manager
@@ -257,7 +254,6 @@ mod tests {
DropTableTarget::Logical,
);
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,
@@ -270,7 +266,6 @@ mod tests {
}
// Execute again
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,
@@ -345,7 +340,7 @@ mod tests {
async fn test_next_retryable_err() {
let node_manager = Arc::new(MockDatanodeManager::new(RetryErrorDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await;
let physical_table_id = create_physical_table(&ddl_context, "phy").await;
let (_, table_route) = ddl_context
.table_metadata_manager
.table_route_manager()
@@ -360,7 +355,6 @@ mod tests {
DropTableTarget::Physical,
);
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,
@@ -374,7 +368,7 @@ mod tests {
async fn test_on_recovery() {
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let physical_table_id = create_physical_table(&ddl_context, 0, "phy").await;
let physical_table_id = create_physical_table(&ddl_context, "phy").await;
let (_, table_route) = ddl_context
.table_metadata_manager
.table_route_manager()
@@ -390,7 +384,6 @@ mod tests {
DropTableTarget::Physical,
);
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
drop_if_exists: false,

View File

@@ -118,7 +118,6 @@ mod tests {
.unwrap();
let mut state = DropDatabaseRemoveMetadata;
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: "foo".to_string(),
schema: "bar".to_string(),
drop_if_exists: true,
@@ -145,7 +144,6 @@ mod tests {
// Schema not exists
let mut state = DropDatabaseRemoveMetadata;
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: "foo".to_string(),
schema: "bar".to_string(),
drop_if_exists: true,

View File

@@ -89,7 +89,6 @@ mod tests {
let ddl_context = new_ddl_context(node_manager);
let mut step = DropDatabaseStart;
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: "foo".to_string(),
schema: "bar".to_string(),
drop_if_exists: false,
@@ -105,7 +104,6 @@ mod tests {
let ddl_context = new_ddl_context(node_manager);
let mut state = DropDatabaseStart;
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: "foo".to_string(),
schema: "bar".to_string(),
drop_if_exists: true,
@@ -128,7 +126,6 @@ mod tests {
.unwrap();
let mut state = DropDatabaseStart;
let mut ctx = DropDatabaseContext {
cluster_id: 0,
catalog: "foo".to_string(),
schema: "bar".to_string(),
drop_if_exists: false,

View File

@@ -37,8 +37,8 @@ use crate::instruction::{CacheIdent, DropFlow};
use crate::key::flow::flow_info::FlowInfoValue;
use crate::key::flow::flow_route::FlowRouteValue;
use crate::lock_key::{CatalogLock, FlowLock};
use crate::metrics;
use crate::rpc::ddl::DropFlowTask;
use crate::{metrics, ClusterId};
/// The procedure for dropping a flow.
pub struct DropFlowProcedure {
@@ -51,12 +51,11 @@ pub struct DropFlowProcedure {
impl DropFlowProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::DropFlow";
pub fn new(cluster_id: ClusterId, task: DropFlowTask, context: DdlContext) -> Self {
pub fn new(task: DropFlowTask, context: DdlContext) -> Self {
Self {
context,
data: DropFlowData {
state: DropFlowState::Prepare,
cluster_id,
task,
flow_info_value: None,
flow_route_values: vec![],
@@ -218,7 +217,6 @@ impl Procedure for DropFlowProcedure {
#[derive(Debug, Serialize, Deserialize)]
pub(crate) struct DropFlowData {
state: DropFlowState,
cluster_id: ClusterId,
task: DropFlowTask,
pub(crate) flow_info_value: Option<FlowInfoValue>,
pub(crate) flow_route_values: Vec<FlowRouteValue>,
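Because `DropFlowData` derives `Serialize`/`Deserialize` and is persisted as procedure state, dropping the `cluster_id` field raises the question of whether previously persisted payloads still load. A minimal sketch of that concern, using a hypothetical `FlowData` mirror rather than the real type and assuming `serde`/`serde_json` are available: serde ignores unknown JSON keys by default, so an old payload that still carries `cluster_id` deserializes into the slimmed-down struct.

```rust
use serde::{Deserialize, Serialize};

// Hypothetical mirror of a procedure data struct after the `cluster_id`
// field was removed; not the real `DropFlowData` definition.
#[derive(Debug, Serialize, Deserialize)]
struct FlowData {
    state: String,
    flow_name: String,
}

fn main() {
    // Payload persisted by an older version that still carried `cluster_id`.
    let old_json = r#"{"state":"Prepare","cluster_id":0,"flow_name":"my_flow"}"#;
    // serde ignores unknown fields by default, so the old payload still loads.
    let data: FlowData = serde_json::from_str(old_json).expect("old payload should deserialize");
    assert_eq!(data.flow_name, "my_flow");
    println!("{data:?}");
}
```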

View File

@@ -40,10 +40,10 @@ use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::key::table_route::TableRouteValue;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::region_keeper::OperatingRegionGuard;
use crate::rpc::ddl::DropTableTask;
use crate::rpc::router::{operating_leader_regions, RegionRoute};
use crate::{metrics, ClusterId};
pub struct DropTableProcedure {
/// The context of procedure runtime.
@@ -59,8 +59,8 @@ pub struct DropTableProcedure {
impl DropTableProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::DropTable";
pub fn new(cluster_id: ClusterId, task: DropTableTask, context: DdlContext) -> Self {
let data = DropTableData::new(cluster_id, task);
pub fn new(task: DropTableTask, context: DdlContext) -> Self {
let data = DropTableData::new(task);
let executor = data.build_executor();
Self {
context,
@@ -268,7 +268,6 @@ impl Procedure for DropTableProcedure {
#[derive(Debug, Serialize, Deserialize)]
pub struct DropTableData {
pub state: DropTableState,
pub cluster_id: ClusterId,
pub task: DropTableTask,
pub physical_region_routes: Vec<RegionRoute>,
pub physical_table_id: Option<TableId>,
@@ -279,10 +278,9 @@ pub struct DropTableData {
}
impl DropTableData {
pub fn new(cluster_id: ClusterId, task: DropTableTask) -> Self {
pub fn new(task: DropTableTask) -> Self {
Self {
state: DropTableState::Prepare,
cluster_id,
task,
physical_region_routes: vec![],
physical_table_id: None,
@@ -301,7 +299,6 @@ impl DropTableData {
fn build_executor(&self) -> DropTableExecutor {
DropTableExecutor::new(
self.cluster_id,
self.task.table_name(),
self.task.table_id,
self.task.drop_if_exists,
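To make the shape of this refactor easier to scan, here is a small self-contained mirror of the narrowed constructor chain with stand-in types (`Task`, `Data`, `Executor`) rather than the real `DropTableTask`/`DropTableData`/`DropTableExecutor`: the data struct no longer stores a cluster id, so `build_executor` forwards only what the task itself carries.

```rust
// Stand-in types that mirror the shape of the change; not the real definitions.
#[derive(Clone, Debug)]
struct Task {
    table_name: String,
    table_id: u32,
    drop_if_exists: bool,
}

struct Executor {
    table_name: String,
    table_id: u32,
    drop_if_exists: bool,
}

struct Data {
    task: Task,
}

impl Data {
    // Before the refactor this took an extra `cluster_id` argument and stored it.
    fn new(task: Task) -> Self {
        Self { task }
    }

    // The executor is now built purely from the task's own fields.
    fn build_executor(&self) -> Executor {
        Executor {
            table_name: self.task.table_name.clone(),
            table_id: self.task.table_id,
            drop_if_exists: self.task.drop_if_exists,
        }
    }
}

fn main() {
    let data = Data::new(Task {
        table_name: "my_table".to_string(),
        table_id: 1024,
        drop_if_exists: false,
    });
    let executor = data.build_executor();
    assert!(!executor.drop_if_exists);
    println!("dropping {} (id {})", executor.table_name, executor.table_id);
}
```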

View File

@@ -36,7 +36,6 @@ use crate::instruction::CacheIdent;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::rpc::router::{find_leader_regions, find_leaders, RegionRoute};
use crate::ClusterId;
/// [Control] indicates to the caller whether to go to the next step.
#[derive(Debug)]
@@ -54,14 +53,8 @@ impl<T> Control<T> {
impl DropTableExecutor {
/// Returns the [DropTableExecutor].
pub fn new(
cluster_id: ClusterId,
table: TableName,
table_id: TableId,
drop_if_exists: bool,
) -> Self {
pub fn new(table: TableName, table_id: TableId, drop_if_exists: bool) -> Self {
Self {
cluster_id,
table,
table_id,
drop_if_exists,
@@ -74,7 +67,6 @@ impl DropTableExecutor {
/// - Invalidates the cache on the Frontend nodes.
/// - Drops the regions on the Datanode nodes.
pub struct DropTableExecutor {
cluster_id: ClusterId,
table: TableName,
table_id: TableId,
drop_if_exists: bool,
@@ -164,7 +156,7 @@ impl DropTableExecutor {
let detecting_regions = if table_route_value.is_physical() {
// Safety: checked.
let regions = table_route_value.region_routes().unwrap();
convert_region_routes_to_detecting_regions(self.cluster_id, regions)
convert_region_routes_to_detecting_regions(regions)
} else {
vec![]
};
@@ -321,7 +313,6 @@ mod tests {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ctx = new_ddl_context(node_manager);
let executor = DropTableExecutor::new(
0,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_table"),
1024,
true,
@@ -331,7 +322,6 @@ mod tests {
// Drops a non-existent table
let executor = DropTableExecutor::new(
0,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_table"),
1024,
false,
@@ -341,7 +331,6 @@ mod tests {
// Drops an existing table
let executor = DropTableExecutor::new(
0,
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_table"),
1024,
false,
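The executor also stops threading a cluster id into region failure detection: `convert_region_routes_to_detecting_regions` above now derives the detecting set from the routes alone. A rough, self-contained sketch of that idea with hypothetical `RegionRoute`/`RegionId` stand-ins (the real types come from the crate's routing and storage APIs and carry more information):

```rust
// Hypothetical stand-ins; the real RegionRoute/RegionId carry much more data.
type RegionId = u64;

struct RegionRoute {
    region_id: RegionId,
    leader_peer: Option<u64>,
}

// Mirrors the idea of deriving the regions to watch from the routes alone,
// without a cluster id parameter.
fn convert_region_routes_to_detecting_regions(routes: &[RegionRoute]) -> Vec<RegionId> {
    routes
        .iter()
        .filter(|route| route.leader_peer.is_some())
        .map(|route| route.region_id)
        .collect()
}

fn main() {
    let routes = vec![
        RegionRoute { region_id: 1, leader_peer: Some(42) },
        RegionRoute { region_id: 2, leader_peer: None },
    ];
    assert_eq!(convert_region_routes_to_detecting_regions(&routes), vec![1]);
}
```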

View File

@@ -31,8 +31,8 @@ use crate::error::{self, Result};
use crate::instruction::CacheIdent;
use crate::key::table_name::TableNameKey;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::rpc::ddl::DropViewTask;
use crate::{metrics, ClusterId};
/// The procedure for dropping a view.
pub struct DropViewProcedure {
@@ -45,12 +45,11 @@ pub struct DropViewProcedure {
impl DropViewProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::DropView";
pub fn new(cluster_id: ClusterId, task: DropViewTask, context: DdlContext) -> Self {
pub fn new(task: DropViewTask, context: DdlContext) -> Self {
Self {
context,
data: DropViewData {
state: DropViewState::Prepare,
cluster_id,
task,
},
}
@@ -216,7 +215,6 @@ impl Procedure for DropViewProcedure {
#[derive(Debug, Serialize, Deserialize)]
pub(crate) struct DropViewData {
state: DropViewState,
cluster_id: ClusterId,
task: DropViewTask,
}

View File

@@ -20,7 +20,6 @@ use crate::error::Result;
use crate::key::FlowId;
use crate::peer::Peer;
use crate::sequence::SequenceRef;
use crate::ClusterId;
/// The reference of [FlowMetadataAllocator].
pub type FlowMetadataAllocatorRef = Arc<FlowMetadataAllocator>;
@@ -60,16 +59,9 @@ impl FlowMetadataAllocator {
}
/// Allocates the [FlowId] and [Peer]s.
pub async fn create(
&self,
cluster_id: ClusterId,
partitions: usize,
) -> Result<(FlowId, Vec<Peer>)> {
pub async fn create(&self, partitions: usize) -> Result<(FlowId, Vec<Peer>)> {
let flow_id = self.allocate_flow_id().await?;
let peers = self
.partition_peer_allocator
.alloc(cluster_id, partitions)
.await?;
let peers = self.partition_peer_allocator.alloc(partitions).await?;
Ok((flow_id, peers))
}
@@ -79,7 +71,7 @@ impl FlowMetadataAllocator {
#[async_trait]
pub trait PartitionPeerAllocator: Send + Sync {
/// Allocates [Peer] nodes for storing partitions.
async fn alloc(&self, cluster_id: ClusterId, partitions: usize) -> Result<Vec<Peer>>;
async fn alloc(&self, partitions: usize) -> Result<Vec<Peer>>;
}
/// [PartitionPeerAllocatorRef] allocates [Peer]s for partitions.
@@ -89,7 +81,7 @@ struct NoopPartitionPeerAllocator;
#[async_trait]
impl PartitionPeerAllocator for NoopPartitionPeerAllocator {
async fn alloc(&self, _cluster_id: ClusterId, partitions: usize) -> Result<Vec<Peer>> {
async fn alloc(&self, partitions: usize) -> Result<Vec<Peer>> {
Ok(vec![Peer::default(); partitions])
}
}
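For reference, the narrowed allocator trait can be mirrored in isolation: the allocator is asked only for a partition count and returns that many peers. A minimal sketch with a stand-in `Peer` type, assuming the `async-trait` and `tokio` crates are available:

```rust
use async_trait::async_trait;

// Stand-in peer type; the real `Peer` carries a node id and address.
#[derive(Clone, Default, Debug)]
struct Peer {
    id: u64,
    addr: String,
}

#[async_trait]
trait PartitionPeerAllocator: Send + Sync {
    /// Allocates peers for `partitions` partitions; no cluster id is needed.
    async fn alloc(&self, partitions: usize) -> Vec<Peer>;
}

struct NoopPartitionPeerAllocator;

#[async_trait]
impl PartitionPeerAllocator for NoopPartitionPeerAllocator {
    async fn alloc(&self, partitions: usize) -> Vec<Peer> {
        vec![Peer::default(); partitions]
    }
}

#[tokio::main]
async fn main() {
    let allocator = NoopPartitionPeerAllocator;
    let peers = allocator.alloc(3).await;
    assert_eq!(peers.len(), 3);
    println!("first peer: id={}, addr={:?}", peers[0].id, peers[0].addr);
}
```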

View File

@@ -20,7 +20,7 @@ use common_telemetry::{debug, info};
use snafu::ensure;
use store_api::storage::{RegionId, RegionNumber, TableId};
use crate::ddl::{TableMetadata, TableMetadataAllocatorContext};
use crate::ddl::TableMetadata;
use crate::error::{self, Result, UnsupportedSnafu};
use crate::key::table_route::PhysicalTableRouteValue;
use crate::peer::Peer;
@@ -109,7 +109,6 @@ impl TableMetadataAllocator {
async fn create_table_route(
&self,
ctx: &TableMetadataAllocatorContext,
table_id: TableId,
task: &CreateTableTask,
) -> Result<PhysicalTableRouteValue> {
@@ -121,7 +120,7 @@ impl TableMetadataAllocator {
}
);
let peers = self.peer_allocator.alloc(ctx, regions).await?;
let peers = self.peer_allocator.alloc(regions).await?;
let region_routes = task
.partitions
.iter()
@@ -147,11 +146,7 @@ impl TableMetadataAllocator {
}
/// Create VIEW metadata
pub async fn create_view(
&self,
_ctx: &TableMetadataAllocatorContext,
table_id: &Option<api::v1::TableId>,
) -> Result<TableMetadata> {
pub async fn create_view(&self, table_id: &Option<api::v1::TableId>) -> Result<TableMetadata> {
let table_id = self.allocate_table_id(table_id).await?;
Ok(TableMetadata {
@@ -160,13 +155,9 @@ impl TableMetadataAllocator {
})
}
pub async fn create(
&self,
ctx: &TableMetadataAllocatorContext,
task: &CreateTableTask,
) -> Result<TableMetadata> {
pub async fn create(&self, task: &CreateTableTask) -> Result<TableMetadata> {
let table_id = self.allocate_table_id(&task.create_table.table_id).await?;
let table_route = self.create_table_route(ctx, table_id, task).await?;
let table_route = self.create_table_route(table_id, task).await?;
let region_wal_options = self.create_wal_options(&table_route)?;
debug!(
@@ -188,19 +179,14 @@ pub type PeerAllocatorRef = Arc<dyn PeerAllocator>;
#[async_trait]
pub trait PeerAllocator: Send + Sync {
/// Allocates `regions` number of [`Peer`]s.
async fn alloc(&self, ctx: &TableMetadataAllocatorContext, regions: usize)
-> Result<Vec<Peer>>;
async fn alloc(&self, regions: usize) -> Result<Vec<Peer>>;
}
struct NoopPeerAllocator;
#[async_trait]
impl PeerAllocator for NoopPeerAllocator {
async fn alloc(
&self,
_ctx: &TableMetadataAllocatorContext,
regions: usize,
) -> Result<Vec<Peer>> {
async fn alloc(&self, regions: usize) -> Result<Vec<Peer>> {
Ok(vec![Peer::default(); regions])
}
}
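The `create`/`create_view` paths now revolve around the task and the optional caller-supplied table id alone. As a rough illustration of that allocation pattern (honor the provided id, otherwise draw the next one from a sequence), here is a hypothetical stand-alone allocator; the real implementation draws from the metasrv-backed sequence and returns a `Result`:

```rust
use std::sync::atomic::{AtomicU32, Ordering};

// Hypothetical in-memory sequence standing in for the metasrv-backed one.
struct TableIdAllocator {
    next: AtomicU32,
}

impl TableIdAllocator {
    fn new(start: u32) -> Self {
        Self { next: AtomicU32::new(start) }
    }

    // Honors a caller-provided table id, otherwise allocates the next one.
    fn allocate_table_id(&self, requested: &Option<u32>) -> u32 {
        match requested {
            Some(id) => *id,
            None => self.next.fetch_add(1, Ordering::Relaxed),
        }
    }
}

fn main() {
    let allocator = TableIdAllocator::new(1024);
    assert_eq!(allocator.allocate_table_id(&Some(42)), 42);
    assert_eq!(allocator.allocate_table_id(&None), 1024);
    assert_eq!(allocator.allocate_table_id(&None), 1025);
}
```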

View File

@@ -31,10 +31,9 @@ use crate::ddl::test_util::columns::TestColumnDefBuilder;
use crate::ddl::test_util::create_table::{
build_raw_table_info_from_expr, TestCreateTableExprBuilder,
};
use crate::ddl::{DdlContext, TableMetadata, TableMetadataAllocatorContext};
use crate::ddl::{DdlContext, TableMetadata};
use crate::key::table_route::TableRouteValue;
use crate::rpc::ddl::CreateTableTask;
use crate::ClusterId;
pub async fn create_physical_table_metadata(
ddl_context: &DdlContext,
@@ -48,11 +47,7 @@ pub async fn create_physical_table_metadata(
.unwrap();
}
pub async fn create_physical_table(
ddl_context: &DdlContext,
cluster_id: ClusterId,
name: &str,
) -> TableId {
pub async fn create_physical_table(ddl_context: &DdlContext, name: &str) -> TableId {
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task(name);
let TableMetadata {
@@ -61,10 +56,7 @@ pub async fn create_physical_table(
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -80,15 +72,13 @@ pub async fn create_physical_table(
pub async fn create_logical_table(
ddl_context: DdlContext,
cluster_id: ClusterId,
physical_table_id: TableId,
table_name: &str,
) -> TableId {
use std::assert_matches::assert_matches;
let tasks = vec![test_create_logical_table_task(table_name)];
let mut procedure =
CreateLogicalTablesProcedure::new(cluster_id, tasks, physical_table_id, ddl_context);
let mut procedure = CreateLogicalTablesProcedure::new(tasks, physical_table_id, ddl_context);
let status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Executing { persist: true });
let status = procedure.on_create_metadata().await.unwrap();

View File

@@ -86,7 +86,6 @@ fn make_alter_logical_table_rename_task(
async fn test_on_prepare_check_schema() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let tasks = vec![
make_alter_logical_table_add_column_task(
Some("schema1"),
@@ -100,8 +99,7 @@ async fn test_on_prepare_check_schema() {
),
];
let physical_table_id = 1024u32;
let mut procedure =
AlterLogicalTablesProcedure::new(cluster_id, tasks, physical_table_id, ddl_context);
let mut procedure = AlterLogicalTablesProcedure::new(tasks, physical_table_id, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, AlterLogicalTablesInvalidArguments { .. });
}
@@ -110,50 +108,46 @@ async fn test_on_prepare_check_schema() {
async fn test_on_prepare_check_alter_kind() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let tasks = vec![make_alter_logical_table_rename_task(
"schema1",
"table1",
"new_table1",
)];
let physical_table_id = 1024u32;
let mut procedure =
AlterLogicalTablesProcedure::new(cluster_id, tasks, physical_table_id, ddl_context);
let mut procedure = AlterLogicalTablesProcedure::new(tasks, physical_table_id, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, AlterLogicalTablesInvalidArguments { .. });
}
#[tokio::test]
async fn test_on_prepare_different_physical_table() {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let phy1_id = create_physical_table(&ddl_context, cluster_id, "phy1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy1_id, "table1").await;
let phy2_id = create_physical_table(&ddl_context, cluster_id, "phy2").await;
create_logical_table(ddl_context.clone(), cluster_id, phy2_id, "table2").await;
let phy1_id = create_physical_table(&ddl_context, "phy1").await;
create_logical_table(ddl_context.clone(), phy1_id, "table1").await;
let phy2_id = create_physical_table(&ddl_context, "phy2").await;
create_logical_table(ddl_context.clone(), phy2_id, "table2").await;
let tasks = vec![
make_alter_logical_table_add_column_task(None, "table1", vec!["column1".to_string()]),
make_alter_logical_table_add_column_task(None, "table2", vec!["column2".to_string()]),
];
let mut procedure = AlterLogicalTablesProcedure::new(cluster_id, tasks, phy1_id, ddl_context);
let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy1_id, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, AlterLogicalTablesInvalidArguments { .. });
}
#[tokio::test]
async fn test_on_prepare_logical_table_not_exists() {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
// Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await;
let phy_id = create_physical_table(&ddl_context, "phy").await;
// Creates a logical table
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), phy_id, "table1").await;
let tasks = vec![
make_alter_logical_table_add_column_task(None, "table1", vec!["column1".to_string()]),
@@ -161,23 +155,22 @@ async fn test_on_prepare_logical_table_not_exists() {
make_alter_logical_table_add_column_task(None, "table2", vec!["column2".to_string()]),
];
let mut procedure = AlterLogicalTablesProcedure::new(cluster_id, tasks, phy_id, ddl_context);
let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy_id, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, TableNotFound { .. });
}
#[tokio::test]
async fn test_on_prepare() {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
// Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await;
let phy_id = create_physical_table(&ddl_context, "phy").await;
// Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table3").await;
create_logical_table(ddl_context.clone(), phy_id, "table1").await;
create_logical_table(ddl_context.clone(), phy_id, "table2").await;
create_logical_table(ddl_context.clone(), phy_id, "table3").await;
let tasks = vec![
make_alter_logical_table_add_column_task(None, "table1", vec!["column1".to_string()]),
@@ -185,25 +178,24 @@ async fn test_on_prepare() {
make_alter_logical_table_add_column_task(None, "table3", vec!["column3".to_string()]),
];
let mut procedure = AlterLogicalTablesProcedure::new(cluster_id, tasks, phy_id, ddl_context);
let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy_id, ddl_context);
let result = procedure.on_prepare().await;
assert_matches!(result, Ok(Status::Executing { persist: true }));
}
#[tokio::test]
async fn test_on_update_metadata() {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
// Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await;
let phy_id = create_physical_table(&ddl_context, "phy").await;
// Creates 5 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table3").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table4").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table5").await;
create_logical_table(ddl_context.clone(), phy_id, "table1").await;
create_logical_table(ddl_context.clone(), phy_id, "table2").await;
create_logical_table(ddl_context.clone(), phy_id, "table3").await;
create_logical_table(ddl_context.clone(), phy_id, "table4").await;
create_logical_table(ddl_context.clone(), phy_id, "table5").await;
let tasks = vec![
make_alter_logical_table_add_column_task(None, "table1", vec!["new_col".to_string()]),
@@ -211,7 +203,7 @@ async fn test_on_update_metadata() {
make_alter_logical_table_add_column_task(None, "table3", vec!["new_col".to_string()]),
];
let mut procedure = AlterLogicalTablesProcedure::new(cluster_id, tasks, phy_id, ddl_context);
let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy_id, ddl_context);
let mut status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Executing { persist: true });
@@ -229,23 +221,21 @@ async fn test_on_update_metadata() {
#[tokio::test]
async fn test_on_part_duplicate_alter_request() {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
// Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await;
let phy_id = create_physical_table(&ddl_context, "phy").await;
// Creates 2 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
create_logical_table(ddl_context.clone(), phy_id, "table1").await;
create_logical_table(ddl_context.clone(), phy_id, "table2").await;
let tasks = vec![
make_alter_logical_table_add_column_task(None, "table1", vec!["col_0".to_string()]),
make_alter_logical_table_add_column_task(None, "table2", vec!["col_0".to_string()]),
];
let mut procedure =
AlterLogicalTablesProcedure::new(cluster_id, tasks, phy_id, ddl_context.clone());
let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy_id, ddl_context.clone());
let mut status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Executing { persist: true });
@@ -278,8 +268,7 @@ async fn test_on_part_duplicate_alter_request() {
),
];
let mut procedure =
AlterLogicalTablesProcedure::new(cluster_id, tasks, phy_id, ddl_context.clone());
let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy_id, ddl_context.clone());
let mut status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Executing { persist: true });

View File

@@ -59,7 +59,6 @@ fn test_rename_alter_table_task(table_name: &str, new_table_name: &str) -> Alter
async fn test_on_prepare_table_exists_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_table_task("foo", 1024);
// Puts a value to table name key.
ddl_context
@@ -73,7 +72,7 @@ async fn test_on_prepare_table_exists_err() {
.unwrap();
let task = test_rename_alter_table_task("non-exists", "foo");
let mut procedure = AlterTableProcedure::new(cluster_id, 1024, task, ddl_context).unwrap();
let mut procedure = AlterTableProcedure::new(1024, task, ddl_context).unwrap();
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err.status_code(), StatusCode::TableAlreadyExists);
}
@@ -82,9 +81,8 @@ async fn test_on_prepare_table_exists_err() {
async fn test_on_prepare_table_not_exists_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_rename_alter_table_task("non-exists", "foo");
let mut procedure = AlterTableProcedure::new(cluster_id, 1024, task, ddl_context).unwrap();
let mut procedure = AlterTableProcedure::new(1024, task, ddl_context).unwrap();
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err.status_code(), StatusCode::TableNotFound);
}
@@ -95,7 +93,6 @@ async fn test_on_submit_alter_request() {
let datanode_handler = DatanodeWatcher(tx);
let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_id = 1024;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
@@ -144,8 +141,7 @@ async fn test_on_submit_alter_request() {
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, alter_table_task, ddl_context).unwrap();
let mut procedure = AlterTableProcedure::new(table_id, alter_table_task, ddl_context).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
@@ -181,7 +177,6 @@ async fn test_on_submit_alter_request_with_outdated_request() {
RequestOutdatedErrorDatanodeHandler,
));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_id = 1024;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
@@ -230,8 +225,7 @@ async fn test_on_submit_alter_request_with_outdated_request() {
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, alter_table_task, ddl_context).unwrap();
let mut procedure = AlterTableProcedure::new(table_id, alter_table_task, ddl_context).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
}
@@ -240,7 +234,6 @@ async fn test_on_submit_alter_request_with_outdated_request() {
async fn test_on_update_metadata_rename() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_name = "foo";
let new_table_name = "bar";
let table_id = 1024;
@@ -257,8 +250,7 @@ async fn test_on_update_metadata_rename() {
.unwrap();
let task = test_rename_alter_table_task(table_name, new_table_name);
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context.clone()).unwrap();
let mut procedure = AlterTableProcedure::new(table_id, task, ddl_context.clone()).unwrap();
procedure.on_prepare().await.unwrap();
procedure.on_update_metadata().await.unwrap();
@@ -291,7 +283,6 @@ async fn test_on_update_metadata_rename() {
async fn test_on_update_metadata_add_columns() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
@@ -335,8 +326,7 @@ async fn test_on_update_metadata_add_columns() {
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context.clone()).unwrap();
let mut procedure = AlterTableProcedure::new(table_id, task, ddl_context.clone()).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
procedure.on_update_metadata().await.unwrap();
@@ -361,7 +351,6 @@ async fn test_on_update_metadata_add_columns() {
async fn test_on_update_table_options() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
@@ -398,8 +387,7 @@ async fn test_on_update_table_options() {
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context.clone()).unwrap();
let mut procedure = AlterTableProcedure::new(table_id, task, ddl_context.clone()).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
procedure.on_update_metadata().await.unwrap();

View File

@@ -25,11 +25,11 @@ use crate::ddl::create_flow::CreateFlowProcedure;
use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
use crate::ddl::DdlContext;
use crate::error;
use crate::key::table_route::TableRouteValue;
use crate::key::FlowId;
use crate::rpc::ddl::CreateFlowTask;
use crate::test_util::{new_ddl_context, MockFlownodeManager};
use crate::{error, ClusterId};
pub(crate) fn test_create_flow_task(
name: &str,
@@ -53,7 +53,6 @@ pub(crate) fn test_create_flow_task(
#[tokio::test]
async fn test_create_flow_source_table_not_found() {
let cluster_id = 1;
let source_table_names = vec![TableName::new(
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
@@ -65,14 +64,13 @@ async fn test_create_flow_source_table_not_found() {
let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
let ddl_context = new_ddl_context(node_manager);
let query_ctx = QueryContext::arc().into();
let mut procedure = CreateFlowProcedure::new(cluster_id, task, query_ctx, ddl_context);
let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, error::Error::TableNotFound { .. });
}
pub(crate) async fn create_test_flow(
ddl_context: &DdlContext,
cluster_id: ClusterId,
flow_name: &str,
source_table_names: Vec<TableName>,
sink_table_name: TableName,
@@ -84,8 +82,7 @@ pub(crate) async fn create_test_flow(
false,
);
let query_ctx = QueryContext::arc().into();
let mut procedure =
CreateFlowProcedure::new(cluster_id, task.clone(), query_ctx, ddl_context.clone());
let mut procedure = CreateFlowProcedure::new(task.clone(), query_ctx, ddl_context.clone());
let output = execute_procedure_until_done(&mut procedure).await.unwrap();
let flow_id = output.downcast_ref::<FlowId>().unwrap();
@@ -94,7 +91,6 @@ pub(crate) async fn create_test_flow(
#[tokio::test]
async fn test_create_flow() {
let cluster_id = 1;
let table_id = 1024;
let source_table_names = vec![TableName::new(
DEFAULT_CATALOG_NAME,
@@ -118,7 +114,6 @@ async fn test_create_flow() {
.unwrap();
let flow_id = create_test_flow(
&ddl_context,
cluster_id,
"my_flow",
source_table_names.clone(),
sink_table_name.clone(),
@@ -134,8 +129,7 @@ async fn test_create_flow() {
true,
);
let query_ctx = QueryContext::arc().into();
let mut procedure =
CreateFlowProcedure::new(cluster_id, task.clone(), query_ctx, ddl_context.clone());
let mut procedure = CreateFlowProcedure::new(task.clone(), query_ctx, ddl_context.clone());
let output = execute_procedure_until_done(&mut procedure).await.unwrap();
let flow_id = output.downcast_ref::<FlowId>().unwrap();
assert_eq!(*flow_id, 1024);
@@ -143,7 +137,7 @@ async fn test_create_flow() {
// Creates again
let task = test_create_flow_task("my_flow", source_table_names, sink_table_name, false);
let query_ctx = QueryContext::arc().into();
let mut procedure = CreateFlowProcedure::new(cluster_id, task.clone(), query_ctx, ddl_context);
let mut procedure = CreateFlowProcedure::new(task.clone(), query_ctx, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, error::Error::FlowAlreadyExists { .. });
}

View File

@@ -26,7 +26,7 @@ use crate::ddl::test_util::datanode_handler::NaiveDatanodeHandler;
use crate::ddl::test_util::{
create_physical_table_metadata, test_create_logical_table_task, test_create_physical_table_task,
};
use crate::ddl::{TableMetadata, TableMetadataAllocatorContext};
use crate::ddl::TableMetadata;
use crate::error::Error;
use crate::key::table_route::TableRouteValue;
use crate::test_util::{new_ddl_context, MockDatanodeManager};
@@ -35,11 +35,9 @@ use crate::test_util::{new_ddl_context, MockDatanodeManager};
async fn test_on_prepare_physical_table_not_found() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let tasks = vec![test_create_logical_table_task("foo")];
let physical_table_id = 1024u32;
let mut procedure =
CreateLogicalTablesProcedure::new(cluster_id, tasks, physical_table_id, ddl_context);
let mut procedure = CreateLogicalTablesProcedure::new(tasks, physical_table_id, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, Error::TableRouteNotFound { .. });
}
@@ -48,7 +46,6 @@ async fn test_on_prepare_physical_table_not_found() {
async fn test_on_prepare() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -57,10 +54,7 @@ async fn test_on_prepare() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -73,8 +67,7 @@ async fn test_on_prepare() {
// The create logical table procedure.
let tasks = vec![test_create_logical_table_task("foo")];
let physical_table_id = table_id;
let mut procedure =
CreateLogicalTablesProcedure::new(cluster_id, tasks, physical_table_id, ddl_context);
let mut procedure = CreateLogicalTablesProcedure::new(tasks, physical_table_id, ddl_context);
let status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Executing { persist: true });
}
@@ -83,7 +76,6 @@ async fn test_on_prepare() {
async fn test_on_prepare_logical_table_exists_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -92,10 +84,7 @@ async fn test_on_prepare_logical_table_exists_err() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -119,7 +108,7 @@ async fn test_on_prepare_logical_table_exists_err() {
// The create logical table procedure.
let physical_table_id = table_id;
let mut procedure =
CreateLogicalTablesProcedure::new(cluster_id, vec![task], physical_table_id, ddl_context);
CreateLogicalTablesProcedure::new(vec![task], physical_table_id, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, Error::TableAlreadyExists { .. });
assert_eq!(err.status_code(), StatusCode::TableAlreadyExists);
@@ -129,7 +118,6 @@ async fn test_on_prepare_logical_table_exists_err() {
async fn test_on_prepare_with_create_if_table_exists() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -138,10 +126,7 @@ async fn test_on_prepare_with_create_if_table_exists() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -167,7 +152,7 @@ async fn test_on_prepare_with_create_if_table_exists() {
// Sets `create_if_not_exists`
task.create_table.create_if_not_exists = true;
let mut procedure =
CreateLogicalTablesProcedure::new(cluster_id, vec![task], physical_table_id, ddl_context);
CreateLogicalTablesProcedure::new(vec![task], physical_table_id, ddl_context);
let status = procedure.on_prepare().await.unwrap();
let output = status.downcast_output_ref::<Vec<u32>>().unwrap();
assert_eq!(*output, vec![8192]);
@@ -177,7 +162,6 @@ async fn test_on_prepare_with_create_if_table_exists() {
async fn test_on_prepare_part_logical_tables_exist() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -186,10 +170,7 @@ async fn test_on_prepare_part_logical_tables_exist() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -216,7 +197,6 @@ async fn test_on_prepare_part_logical_tables_exist() {
task.create_table.create_if_not_exists = true;
let non_exist_task = test_create_logical_table_task("non_exists");
let mut procedure = CreateLogicalTablesProcedure::new(
cluster_id,
vec![task, non_exist_task],
physical_table_id,
ddl_context,
@@ -229,7 +209,6 @@ async fn test_on_prepare_part_logical_tables_exist() {
async fn test_on_create_metadata() {
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -238,10 +217,7 @@ async fn test_on_create_metadata() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -257,7 +233,6 @@ async fn test_on_create_metadata() {
let task = test_create_logical_table_task("foo");
let yet_another_task = test_create_logical_table_task("bar");
let mut procedure = CreateLogicalTablesProcedure::new(
cluster_id,
vec![task, yet_another_task],
physical_table_id,
ddl_context,
@@ -279,7 +254,6 @@ async fn test_on_create_metadata() {
async fn test_on_create_metadata_part_logical_tables_exist() {
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -288,10 +262,7 @@ async fn test_on_create_metadata_part_logical_tables_exist() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -318,7 +289,6 @@ async fn test_on_create_metadata_part_logical_tables_exist() {
task.create_table.create_if_not_exists = true;
let non_exist_task = test_create_logical_table_task("non_exists");
let mut procedure = CreateLogicalTablesProcedure::new(
cluster_id,
vec![task, non_exist_task],
physical_table_id,
ddl_context,
@@ -340,7 +310,6 @@ async fn test_on_create_metadata_part_logical_tables_exist() {
async fn test_on_create_metadata_err() {
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -349,10 +318,7 @@ async fn test_on_create_metadata_err() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -368,7 +334,6 @@ async fn test_on_create_metadata_err() {
let task = test_create_logical_table_task("foo");
let yet_another_task = test_create_logical_table_task("bar");
let mut procedure = CreateLogicalTablesProcedure::new(
cluster_id,
vec![task.clone(), yet_another_task],
physical_table_id,
ddl_context.clone(),

View File

@@ -87,7 +87,6 @@ pub(crate) fn test_create_table_task(name: &str) -> CreateTableTask {
async fn test_on_prepare_table_exists_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_table_task("foo");
assert!(!task.create_table.create_if_not_exists);
// Puts a value to table name key.
@@ -100,7 +99,7 @@ async fn test_on_prepare_table_exists_err() {
)
.await
.unwrap();
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateTableProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, Error::TableAlreadyExists { .. });
assert_eq!(err.status_code(), StatusCode::TableAlreadyExists);
@@ -110,7 +109,6 @@ async fn test_on_prepare_table_exists_err() {
async fn test_on_prepare_with_create_if_table_exists() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let mut task = test_create_table_task("foo");
task.create_table.create_if_not_exists = true;
task.table_info.ident.table_id = 1024;
@@ -124,7 +122,7 @@ async fn test_on_prepare_with_create_if_table_exists() {
)
.await
.unwrap();
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateTableProcedure::new(task, ddl_context);
let status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Done { output: Some(..) });
let table_id = *status.downcast_output_ref::<u32>().unwrap();
@@ -135,10 +133,9 @@ async fn test_on_prepare_with_create_if_table_exists() {
async fn test_on_prepare_without_create_if_table_exists() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let mut task = test_create_table_task("foo");
task.create_table.create_if_not_exists = true;
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateTableProcedure::new(task, ddl_context);
let status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Executing { persist: true });
assert_eq!(procedure.table_id(), 1024);
@@ -148,11 +145,10 @@ async fn test_on_prepare_without_create_if_table_exists() {
async fn test_on_prepare_with_no_partition_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let mut task = test_create_table_task("foo");
task.partitions = vec![];
task.create_table.create_if_not_exists = true;
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateTableProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, Error::Unexpected { .. });
assert!(err
@@ -165,10 +161,9 @@ async fn test_on_datanode_create_regions_should_retry() {
common_telemetry::init_default_ut_logging();
let node_manager = Arc::new(MockDatanodeManager::new(RetryErrorDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_table_task("foo");
assert!(!task.create_table.create_if_not_exists);
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateTableProcedure::new(task, ddl_context);
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -183,10 +178,9 @@ async fn test_on_datanode_create_regions_should_not_retry() {
common_telemetry::init_default_ut_logging();
let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_table_task("foo");
assert!(!task.create_table.create_if_not_exists);
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateTableProcedure::new(task, ddl_context);
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -201,10 +195,9 @@ async fn test_on_create_metadata_error() {
common_telemetry::init_default_ut_logging();
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_table_task("foo");
assert!(!task.create_table.create_if_not_exists);
let mut procedure = CreateTableProcedure::new(cluster_id, task.clone(), ddl_context.clone());
let mut procedure = CreateTableProcedure::new(task.clone(), ddl_context.clone());
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -233,10 +226,9 @@ async fn test_on_create_metadata() {
common_telemetry::init_default_ut_logging();
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_table_task("foo");
assert!(!task.create_table.create_if_not_exists);
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateTableProcedure::new(task, ddl_context);
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -251,14 +243,12 @@ async fn test_on_create_metadata() {
#[tokio::test]
async fn test_memory_region_keeper_guard_dropped_on_procedure_done() {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(node_manager, kv_backend);
let task = test_create_table_task("foo");
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = CreateTableProcedure::new(task, ddl_context.clone());
execute_procedure_until(&mut procedure, |p| {
p.creator.data.state == CreateTableState::CreateMetadata

View File

@@ -97,7 +97,6 @@ pub(crate) fn test_create_view_task(name: &str) -> CreateViewTask {
async fn test_on_prepare_view_exists_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_view_task("foo");
assert!(!task.create_view.create_if_not_exists);
// Puts a value to table name key.
@@ -113,7 +112,7 @@ async fn test_on_prepare_view_exists_err() {
)
.await
.unwrap();
let mut procedure = CreateViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateViewProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, Error::ViewAlreadyExists { .. });
assert_eq!(err.status_code(), StatusCode::TableAlreadyExists);
@@ -123,7 +122,6 @@ async fn test_on_prepare_view_exists_err() {
async fn test_on_prepare_with_create_if_view_exists() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let mut task = test_create_view_task("foo");
task.create_view.create_if_not_exists = true;
task.view_info.ident.table_id = 1024;
@@ -140,7 +138,7 @@ async fn test_on_prepare_with_create_if_view_exists() {
)
.await
.unwrap();
let mut procedure = CreateViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateViewProcedure::new(task, ddl_context);
let status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Done { output: Some(..) });
let table_id = *status.downcast_output_ref::<u32>().unwrap();
@@ -151,10 +149,9 @@ async fn test_on_prepare_with_create_if_view_exists() {
async fn test_on_prepare_without_create_if_table_exists() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let mut task = test_create_view_task("foo");
task.create_view.create_if_not_exists = true;
let mut procedure = CreateViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateViewProcedure::new(task, ddl_context);
let status = procedure.on_prepare().await.unwrap();
assert_matches!(status, Status::Executing { persist: true });
assert_eq!(procedure.view_id(), 1024);
@@ -165,10 +162,9 @@ async fn test_on_create_metadata() {
common_telemetry::init_default_ut_logging();
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let task = test_create_view_task("foo");
assert!(!task.create_view.create_if_not_exists);
let mut procedure = CreateViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = CreateViewProcedure::new(task, ddl_context);
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -185,10 +181,9 @@ async fn test_replace_view_metadata() {
common_telemetry::init_default_ut_logging();
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager.clone());
let cluster_id = 1;
let task = test_create_view_task("foo");
assert!(!task.create_view.create_if_not_exists);
let mut procedure = CreateViewProcedure::new(cluster_id, task.clone(), ddl_context.clone());
let mut procedure = CreateViewProcedure::new(task.clone(), ddl_context.clone());
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -213,7 +208,7 @@ async fn test_replace_view_metadata() {
let mut task = test_create_view_task("foo");
// The view already exists, prepare should fail
{
let mut procedure = CreateViewProcedure::new(cluster_id, task.clone(), ddl_context.clone());
let mut procedure = CreateViewProcedure::new(task.clone(), ddl_context.clone());
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, Error::ViewAlreadyExists { .. });
assert_eq!(err.status_code(), StatusCode::TableAlreadyExists);
@@ -224,7 +219,7 @@ async fn test_replace_view_metadata() {
task.create_view.logical_plan = vec![4, 5, 6];
task.create_view.definition = "new_definition".to_string();
let mut procedure = CreateViewProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = CreateViewProcedure::new(task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -254,12 +249,11 @@ async fn test_replace_table() {
common_telemetry::init_default_ut_logging();
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager.clone());
let cluster_id = 1;
{
// Create a `foo` table.
let task = test_create_table_task("foo");
let mut procedure = CreateTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = CreateTableProcedure::new(task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
let ctx = ProcedureContext {
procedure_id: ProcedureId::random(),
@@ -272,7 +266,7 @@ async fn test_replace_table() {
// Try to replace a view named `foo` too.
let mut task = test_create_view_task("foo");
task.create_view.or_replace = true;
let mut procedure = CreateViewProcedure::new(cluster_id, task.clone(), ddl_context.clone());
let mut procedure = CreateViewProcedure::new(task.clone(), ddl_context.clone());
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, Error::TableAlreadyExists { .. });
assert_eq!(err.status_code(), StatusCode::TableAlreadyExists);

View File

@@ -31,7 +31,6 @@ use crate::test_util::{new_ddl_context, MockDatanodeManager};
#[tokio::test]
async fn test_drop_database_with_logical_tables() {
common_telemetry::init_default_ut_logging();
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
ddl_context
@@ -45,11 +44,11 @@ async fn test_drop_database_with_logical_tables() {
.await
.unwrap();
// Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await;
let phy_id = create_physical_table(&ddl_context, "phy").await;
// Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table3").await;
create_logical_table(ddl_context.clone(), phy_id, "table1").await;
create_logical_table(ddl_context.clone(), phy_id, "table2").await;
create_logical_table(ddl_context.clone(), phy_id, "table3").await;
let mut procedure = DropDatabaseProcedure::new(
DEFAULT_CATALOG_NAME.to_string(),
@@ -80,7 +79,6 @@ async fn test_drop_database_with_logical_tables() {
#[tokio::test]
async fn test_drop_database_retryable_error() {
common_telemetry::init_default_ut_logging();
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(RetryErrorDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
ddl_context
@@ -94,11 +92,11 @@ async fn test_drop_database_retryable_error() {
.await
.unwrap();
// Creates physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await;
let phy_id = create_physical_table(&ddl_context, "phy").await;
// Creates 3 logical tables
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table2").await;
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table3").await;
create_logical_table(ddl_context.clone(), phy_id, "table1").await;
create_logical_table(ddl_context.clone(), phy_id, "table2").await;
create_logical_table(ddl_context.clone(), phy_id, "table3").await;
let mut procedure = DropDatabaseProcedure::new(
DEFAULT_CATALOG_NAME.to_string(),
@@ -128,7 +126,6 @@ async fn test_drop_database_retryable_error() {
#[tokio::test]
async fn test_drop_database_recover() {
common_telemetry::init_default_ut_logging();
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let ddl_context = new_ddl_context(node_manager);
ddl_context
@@ -142,9 +139,9 @@ async fn test_drop_database_recover() {
.await
.unwrap();
// Creates a physical table
let phy_id = create_physical_table(&ddl_context, cluster_id, "phy").await;
let phy_id = create_physical_table(&ddl_context, "phy").await;
// Creates a logical table
create_logical_table(ddl_context.clone(), cluster_id, phy_id, "table1").await;
create_logical_table(ddl_context.clone(), phy_id, "table1").await;
let mut procedure = DropDatabaseProcedure::new(
DEFAULT_CATALOG_NAME.to_string(),
DEFAULT_SCHEMA_NAME.to_string(),

View File

@@ -40,12 +40,11 @@ fn test_drop_flow_task(flow_name: &str, flow_id: u32, drop_if_exists: bool) -> D
#[tokio::test]
async fn test_drop_flow_not_found() {
let cluster_id = 1;
let flow_id = 1024;
let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
let ddl_context = new_ddl_context(node_manager);
let task = test_drop_flow_task("my_flow", flow_id, false);
let mut procedure = DropFlowProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropFlowProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, error::Error::FlowNotFound { .. });
}
@@ -53,7 +52,6 @@ async fn test_drop_flow_not_found() {
#[tokio::test]
async fn test_drop_flow() {
// create a flow
let cluster_id = 1;
let table_id = 1024;
let source_table_names = vec![TableName::new(
DEFAULT_CATALOG_NAME,
@@ -75,27 +73,21 @@ async fn test_drop_flow() {
)
.await
.unwrap();
let flow_id = create_test_flow(
&ddl_context,
cluster_id,
"my_flow",
source_table_names,
sink_table_name,
)
.await;
let flow_id =
create_test_flow(&ddl_context, "my_flow", source_table_names, sink_table_name).await;
// Drops the flow
let task = test_drop_flow_task("my_flow", flow_id, false);
let mut procedure = DropFlowProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropFlowProcedure::new(task, ddl_context.clone());
execute_procedure_until_done(&mut procedure).await;
// Drops with `drop_if_exists` after the flow is gone
let task = test_drop_flow_task("my_flow", flow_id, true);
let mut procedure = DropFlowProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropFlowProcedure::new(task, ddl_context.clone());
execute_procedure_until_done(&mut procedure).await;
// Drops again
let task = test_drop_flow_task("my_flow", flow_id, false);
let mut procedure = DropFlowProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropFlowProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, error::Error::FlowNotFound { .. });
}

View File

@@ -35,7 +35,7 @@ use crate::ddl::test_util::{
create_logical_table, create_physical_table, create_physical_table_metadata,
test_create_logical_table_task, test_create_physical_table_task,
};
use crate::ddl::{TableMetadata, TableMetadataAllocatorContext};
use crate::ddl::TableMetadata;
use crate::key::table_route::TableRouteValue;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::peer::Peer;
@@ -47,7 +47,6 @@ use crate::test_util::{new_ddl_context, new_ddl_context_with_kv_backend, MockDat
async fn test_on_prepare_table_not_exists_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
@@ -63,7 +62,7 @@ async fn test_on_prepare_table_not_exists_err() {
.unwrap();
let task = new_drop_table_task("bar", table_id, false);
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropTableProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_eq!(err.status_code(), StatusCode::TableNotFound);
}
@@ -72,7 +71,6 @@ async fn test_on_prepare_table_not_exists_err() {
async fn test_on_prepare_table() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
@@ -89,13 +87,13 @@ async fn test_on_prepare_table() {
let task = new_drop_table_task("bar", table_id, true);
// Drop if exists
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropTableProcedure::new(task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
assert!(!procedure.rollback_supported());
let task = new_drop_table_task(table_name, table_id, false);
// Drop table
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropTableProcedure::new(task, ddl_context);
procedure.on_prepare().await.unwrap();
}
@@ -105,7 +103,6 @@ async fn test_on_datanode_drop_regions() {
let datanode_handler = DatanodeWatcher(tx);
let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_id = 1024;
let table_name = "foo";
let task = test_create_table_task(table_name, table_id);
@@ -144,7 +141,7 @@ async fn test_on_datanode_drop_regions() {
let task = new_drop_table_task(table_name, table_id, false);
// Drop table
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropTableProcedure::new(task, ddl_context);
procedure.on_prepare().await.unwrap();
procedure.on_datanode_drop_regions().await.unwrap();
@@ -179,7 +176,6 @@ async fn test_on_rollback() {
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(node_manager, kv_backend.clone());
let cluster_id = 1;
// Prepares physical table metadata.
let mut create_physical_table_task = test_create_physical_table_task("phy_table");
let TableMetadata {
@@ -188,10 +184,7 @@ async fn test_on_rollback() {
..
} = ddl_context
.table_metadata_allocator
.create(
&TableMetadataAllocatorContext { cluster_id },
&create_physical_table_task,
)
.create(&create_physical_table_task)
.await
.unwrap();
create_physical_table_task.set_table_id(table_id);
@@ -205,12 +198,8 @@ async fn test_on_rollback() {
let physical_table_id = table_id;
// Creates the logical table metadata.
let task = test_create_logical_table_task("foo");
let mut procedure = CreateLogicalTablesProcedure::new(
cluster_id,
vec![task],
physical_table_id,
ddl_context.clone(),
);
let mut procedure =
CreateLogicalTablesProcedure::new(vec![task], physical_table_id, ddl_context.clone());
procedure.on_prepare().await.unwrap();
let ctx = new_test_procedure_context();
procedure.execute(&ctx).await.unwrap();
@@ -223,7 +212,7 @@ async fn test_on_rollback() {
// Drops the physical table
{
let task = new_drop_table_task("phy_table", physical_table_id, false);
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropTableProcedure::new(task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
assert!(procedure.rollback_supported());
procedure.on_delete_metadata().await.unwrap();
@@ -238,7 +227,7 @@ async fn test_on_rollback() {
// Drops the logical table
let task = new_drop_table_task("foo", table_ids[0], false);
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropTableProcedure::new(task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
assert!(!procedure.rollback_supported());
}
@@ -255,18 +244,15 @@ fn new_drop_table_task(table_name: &str, table_id: TableId, drop_if_exists: bool
#[tokio::test]
async fn test_memory_region_keeper_guard_dropped_on_procedure_done() {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(node_manager, kv_backend);
let physical_table_id = create_physical_table(&ddl_context, cluster_id, "t").await;
let logical_table_id =
create_logical_table(ddl_context.clone(), cluster_id, physical_table_id, "s").await;
let physical_table_id = create_physical_table(&ddl_context, "t").await;
let logical_table_id = create_logical_table(ddl_context.clone(), physical_table_id, "s").await;
let inner_test = |task: DropTableTask| async {
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropTableProcedure::new(task, ddl_context.clone());
execute_procedure_until(&mut procedure, |p| {
p.data.state == DropTableState::InvalidateTableCache
})
@@ -304,14 +290,13 @@ async fn test_from_json() {
(DropTableState::DatanodeDropRegions, 1, 1),
(DropTableState::DeleteTombstone, 1, 0),
] {
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(node_manager, kv_backend);
let physical_table_id = create_physical_table(&ddl_context, cluster_id, "t").await;
let physical_table_id = create_physical_table(&ddl_context, "t").await;
let task = new_drop_table_task("t", physical_table_id, false);
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropTableProcedure::new(task, ddl_context.clone());
execute_procedure_until(&mut procedure, |p| p.data.state == state).await;
let data = procedure.dump().unwrap();
assert_eq!(
@@ -334,14 +319,13 @@ async fn test_from_json() {
let num_operating_regions = 0;
let num_operating_regions_after_recovery = 0;
let cluster_id = 1;
let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
let kv_backend = Arc::new(MemoryKvBackend::new());
let ddl_context = new_ddl_context_with_kv_backend(node_manager, kv_backend);
let physical_table_id = create_physical_table(&ddl_context, cluster_id, "t").await;
let physical_table_id = create_physical_table(&ddl_context, "t").await;
let task = new_drop_table_task("t", physical_table_id, false);
let mut procedure = DropTableProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropTableProcedure::new(task, ddl_context.clone());
execute_procedure_until_done(&mut procedure).await;
let data = procedure.dump().unwrap();
assert_eq!(

View File

@@ -41,7 +41,6 @@ fn new_drop_view_task(view: &str, view_id: TableId, drop_if_exists: bool) -> Dro
async fn test_on_prepare_view_not_exists_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let view_id = 1024;
let mut task = test_create_view_task("foo");
task.view_info.ident.table_id = view_id;
@@ -60,7 +59,7 @@ async fn test_on_prepare_view_not_exists_err() {
.unwrap();
let task = new_drop_view_task("bar", view_id, false);
let mut procedure = DropViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropViewProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_eq!(err.status_code(), StatusCode::TableNotFound);
}
@@ -69,7 +68,6 @@ async fn test_on_prepare_view_not_exists_err() {
async fn test_on_prepare_not_view_err() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let view_id = 1024;
let view_name = "foo";
let task = test_create_table_task(view_name, view_id);
@@ -85,7 +83,7 @@ async fn test_on_prepare_not_view_err() {
.unwrap();
let task = new_drop_view_task(view_name, view_id, false);
let mut procedure = DropViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropViewProcedure::new(task, ddl_context);
// It's not a view, expect error
let err = procedure.on_prepare().await.unwrap_err();
assert_eq!(err.status_code(), StatusCode::InvalidArguments);
@@ -95,7 +93,6 @@ async fn test_on_prepare_not_view_err() {
async fn test_on_prepare_success() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let view_id = 1024;
let view_name = "foo";
let mut task = test_create_view_task("foo");
@@ -116,12 +113,12 @@ async fn test_on_prepare_success() {
let task = new_drop_view_task("bar", view_id, true);
// Drop if exists
let mut procedure = DropViewProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropViewProcedure::new(task, ddl_context.clone());
procedure.on_prepare().await.unwrap();
let task = new_drop_view_task(view_name, view_id, false);
// Prepare success
let mut procedure = DropViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropViewProcedure::new(task, ddl_context);
procedure.on_prepare().await.unwrap();
assert_eq!(DropViewState::DeleteMetadata, procedure.state());
}
@@ -130,7 +127,6 @@ async fn test_on_prepare_success() {
async fn test_drop_view_success() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let view_id = 1024;
let view_name = "foo";
let mut task = test_create_view_task("foo");
@@ -159,7 +155,7 @@ async fn test_drop_view_success() {
let task = new_drop_view_task(view_name, view_id, false);
// Prepare success
let mut procedure = DropViewProcedure::new(cluster_id, task, ddl_context.clone());
let mut procedure = DropViewProcedure::new(task, ddl_context.clone());
execute_procedure_until_done(&mut procedure).await;
assert_eq!(DropViewState::InvalidateViewCache, procedure.state());
@@ -174,7 +170,7 @@ async fn test_drop_view_success() {
// Drop again
let task = new_drop_view_task(view_name, view_id, false);
let mut procedure = DropViewProcedure::new(cluster_id, task, ddl_context);
let mut procedure = DropViewProcedure::new(task, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
assert_eq!(err.status_code(), StatusCode::TableNotFound);
}

View File

@@ -39,9 +39,9 @@ use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::rpc::ddl::TruncateTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders, RegionRoute};
use crate::{metrics, ClusterId};
pub struct TruncateTableProcedure {
context: DdlContext,
@@ -91,7 +91,6 @@ impl TruncateTableProcedure {
pub(crate) const TYPE_NAME: &'static str = "metasrv-procedure::TruncateTable";
pub(crate) fn new(
cluster_id: ClusterId,
task: TruncateTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
region_routes: Vec<RegionRoute>,
@@ -99,7 +98,7 @@ impl TruncateTableProcedure {
) -> Self {
Self {
context,
data: TruncateTableData::new(cluster_id, task, table_info_value, region_routes),
data: TruncateTableData::new(task, table_info_value, region_routes),
}
}
@@ -189,7 +188,6 @@ impl TruncateTableProcedure {
#[derive(Debug, Serialize, Deserialize)]
pub struct TruncateTableData {
state: TruncateTableState,
cluster_id: ClusterId,
task: TruncateTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
region_routes: Vec<RegionRoute>,
@@ -197,14 +195,12 @@ pub struct TruncateTableData {
impl TruncateTableData {
pub fn new(
cluster_id: ClusterId,
task: TruncateTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
region_routes: Vec<RegionRoute>,
) -> Self {
Self {
state: TruncateTableState::Prepare,
cluster_id,
task,
table_info_value,
region_routes,

View File

@@ -34,7 +34,6 @@ use crate::key::TableMetadataManagerRef;
use crate::peer::Peer;
use crate::rpc::ddl::CreateTableTask;
use crate::rpc::router::RegionRoute;
use crate::ClusterId;
/// Adds [Peer] context if the error is unretryable.
pub fn add_peer_context_if_needed(datanode: Peer) -> impl FnOnce(Error) -> Error {
@@ -144,7 +143,6 @@ pub async fn get_physical_table_id(
/// Converts a list of [`RegionRoute`] to a list of [`DetectingRegion`].
pub fn convert_region_routes_to_detecting_regions(
cluster_id: ClusterId,
region_routes: &[RegionRoute],
) -> Vec<DetectingRegion> {
region_routes
@@ -153,7 +151,7 @@ pub fn convert_region_routes_to_detecting_regions(
route
.leader_peer
.as_ref()
.map(|peer| (cluster_id, peer.id, route.region.id))
.map(|peer| (peer.id, route.region.id))
})
.collect::<Vec<_>>()
}
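With the cluster id gone, a detecting region is keyed by the leader peer and the region alone. A minimal hypothetical call-site sketch (the `region_routes` binding is assumed; the tuple shape mirrors the closure above):

// Each entry is now (leader_peer_id, region_id) instead of (cluster_id, peer_id, region_id).
let detecting_regions = convert_region_routes_to_detecting_regions(&region_routes);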

View File

@@ -60,7 +60,6 @@ use crate::rpc::ddl::{
use crate::rpc::procedure;
use crate::rpc::procedure::{MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse};
use crate::rpc::router::RegionRoute;
use crate::ClusterId;
pub type DdlManagerRef = Arc<DdlManager>;
@@ -154,13 +153,12 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_alter_table_task(
&self,
cluster_id: ClusterId,
table_id: TableId,
alter_table_task: AlterTableTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = AlterTableProcedure::new(cluster_id, table_id, alter_table_task, context)?;
let procedure = AlterTableProcedure::new(table_id, alter_table_task, context)?;
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -171,12 +169,11 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_create_table_task(
&self,
cluster_id: ClusterId,
create_table_task: CreateTableTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = CreateTableProcedure::new(cluster_id, create_table_task, context);
let procedure = CreateTableProcedure::new(create_table_task, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -187,12 +184,11 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_create_view_task(
&self,
cluster_id: ClusterId,
create_view_task: CreateViewTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = CreateViewProcedure::new(cluster_id, create_view_task, context);
let procedure = CreateViewProcedure::new(create_view_task, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -203,18 +199,13 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_create_logical_table_tasks(
&self,
cluster_id: ClusterId,
create_table_tasks: Vec<CreateTableTask>,
physical_table_id: TableId,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = CreateLogicalTablesProcedure::new(
cluster_id,
create_table_tasks,
physical_table_id,
context,
);
let procedure =
CreateLogicalTablesProcedure::new(create_table_tasks, physical_table_id, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -225,18 +216,13 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_alter_logical_table_tasks(
&self,
cluster_id: ClusterId,
alter_table_tasks: Vec<AlterTableTask>,
physical_table_id: TableId,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = AlterLogicalTablesProcedure::new(
cluster_id,
alter_table_tasks,
physical_table_id,
context,
);
let procedure =
AlterLogicalTablesProcedure::new(alter_table_tasks, physical_table_id, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -247,12 +233,11 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_drop_table_task(
&self,
cluster_id: ClusterId,
drop_table_task: DropTableTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = DropTableProcedure::new(cluster_id, drop_table_task, context);
let procedure = DropTableProcedure::new(drop_table_task, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -263,7 +248,6 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_create_database(
&self,
_cluster_id: ClusterId,
CreateDatabaseTask {
catalog,
schema,
@@ -283,7 +267,6 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_drop_database(
&self,
_cluster_id: ClusterId,
DropDatabaseTask {
catalog,
schema,
@@ -299,11 +282,10 @@ impl DdlManager {
pub async fn submit_alter_database(
&self,
cluster_id: ClusterId,
alter_database_task: AlterDatabaseTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = AlterDatabaseProcedure::new(cluster_id, alter_database_task, context)?;
let procedure = AlterDatabaseProcedure::new(alter_database_task, context)?;
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
self.submit_procedure(procedure_with_id).await
@@ -313,12 +295,11 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_create_flow_task(
&self,
cluster_id: ClusterId,
create_flow: CreateFlowTask,
query_context: QueryContext,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = CreateFlowProcedure::new(cluster_id, create_flow, query_context, context);
let procedure = CreateFlowProcedure::new(create_flow, query_context, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
self.submit_procedure(procedure_with_id).await
@@ -328,11 +309,10 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_drop_flow_task(
&self,
cluster_id: ClusterId,
drop_flow: DropFlowTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = DropFlowProcedure::new(cluster_id, drop_flow, context);
let procedure = DropFlowProcedure::new(drop_flow, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
self.submit_procedure(procedure_with_id).await
@@ -342,11 +322,10 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_drop_view_task(
&self,
cluster_id: ClusterId,
drop_view: DropViewTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = DropViewProcedure::new(cluster_id, drop_view, context);
let procedure = DropViewProcedure::new(drop_view, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
self.submit_procedure(procedure_with_id).await
@@ -356,14 +335,12 @@ impl DdlManager {
#[tracing::instrument(skip_all)]
pub async fn submit_truncate_table_task(
&self,
cluster_id: ClusterId,
truncate_table_task: TruncateTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
region_routes: Vec<RegionRoute>,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = TruncateTableProcedure::new(
cluster_id,
truncate_table_task,
table_info_value,
region_routes,
@@ -397,7 +374,6 @@ impl DdlManager {
async fn handle_truncate_table_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
truncate_table_task: TruncateTableTask,
) -> Result<SubmitDdlTaskResponse> {
let table_id = truncate_table_task.table_id;
@@ -416,12 +392,7 @@ async fn handle_truncate_table_task(
let table_route = table_route_value.into_inner().region_routes()?.clone();
let (id, _) = ddl_manager
.submit_truncate_table_task(
cluster_id,
truncate_table_task,
table_info_value,
table_route,
)
.submit_truncate_table_task(truncate_table_task, table_info_value, table_route)
.await?;
info!("Table: {table_id} is truncated via procedure_id {id:?}");
@@ -434,7 +405,6 @@ async fn handle_truncate_table_task(
async fn handle_alter_table_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
alter_table_task: AlterTableTask,
) -> Result<SubmitDdlTaskResponse> {
let table_ref = alter_table_task.table_ref();
@@ -468,7 +438,7 @@ async fn handle_alter_table_task(
);
let (id, _) = ddl_manager
.submit_alter_table_task(cluster_id, table_id, alter_table_task)
.submit_alter_table_task(table_id, alter_table_task)
.await?;
info!("Table: {table_id} is altered via procedure_id {id:?}");
@@ -481,13 +451,10 @@ async fn handle_alter_table_task(
async fn handle_drop_table_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
drop_table_task: DropTableTask,
) -> Result<SubmitDdlTaskResponse> {
let table_id = drop_table_task.table_id;
let (id, _) = ddl_manager
.submit_drop_table_task(cluster_id, drop_table_task)
.await?;
let (id, _) = ddl_manager.submit_drop_table_task(drop_table_task).await?;
info!("Table: {table_id} is dropped via procedure_id {id:?}");
@@ -499,11 +466,10 @@ async fn handle_drop_table_task(
async fn handle_create_table_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
create_table_task: CreateTableTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, output) = ddl_manager
.submit_create_table_task(cluster_id, create_table_task)
.submit_create_table_task(create_table_task)
.await?;
let procedure_id = id.to_string();
@@ -525,7 +491,6 @@ async fn handle_create_table_task(
async fn handle_create_logical_table_tasks(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
create_table_tasks: Vec<CreateTableTask>,
) -> Result<SubmitDdlTaskResponse> {
ensure!(
@@ -542,7 +507,7 @@ async fn handle_create_logical_table_tasks(
let num_logical_tables = create_table_tasks.len();
let (id, output) = ddl_manager
.submit_create_logical_table_tasks(cluster_id, create_table_tasks, physical_table_id)
.submit_create_logical_table_tasks(create_table_tasks, physical_table_id)
.await?;
info!("{num_logical_tables} logical tables on physical table: {physical_table_id:?} is created via procedure_id {id:?}");
@@ -568,11 +533,10 @@ async fn handle_create_logical_table_tasks(
async fn handle_create_database_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
create_database_task: CreateDatabaseTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, _) = ddl_manager
.submit_create_database(cluster_id, create_database_task.clone())
.submit_create_database(create_database_task.clone())
.await?;
let procedure_id = id.to_string();
@@ -589,11 +553,10 @@ async fn handle_create_database_task(
async fn handle_drop_database_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
drop_database_task: DropDatabaseTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, _) = ddl_manager
.submit_drop_database(cluster_id, drop_database_task.clone())
.submit_drop_database(drop_database_task.clone())
.await?;
let procedure_id = id.to_string();
@@ -610,11 +573,10 @@ async fn handle_drop_database_task(
async fn handle_alter_database_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
alter_database_task: AlterDatabaseTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, _) = ddl_manager
.submit_alter_database(cluster_id, alter_database_task.clone())
.submit_alter_database(alter_database_task.clone())
.await?;
let procedure_id = id.to_string();
@@ -632,11 +594,10 @@ async fn handle_alter_database_task(
async fn handle_drop_flow_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
drop_flow_task: DropFlowTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, _) = ddl_manager
.submit_drop_flow_task(cluster_id, drop_flow_task.clone())
.submit_drop_flow_task(drop_flow_task.clone())
.await?;
let procedure_id = id.to_string();
@@ -653,11 +614,10 @@ async fn handle_drop_flow_task(
async fn handle_drop_view_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
drop_view_task: DropViewTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, _) = ddl_manager
.submit_drop_view_task(cluster_id, drop_view_task.clone())
.submit_drop_view_task(drop_view_task.clone())
.await?;
let procedure_id = id.to_string();
@@ -675,12 +635,11 @@ async fn handle_drop_view_task(
async fn handle_create_flow_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
create_flow_task: CreateFlowTask,
query_context: QueryContext,
) -> Result<SubmitDdlTaskResponse> {
let (id, output) = ddl_manager
.submit_create_flow_task(cluster_id, create_flow_task.clone(), query_context)
.submit_create_flow_task(create_flow_task.clone(), query_context)
.await?;
let procedure_id = id.to_string();
@@ -712,7 +671,6 @@ async fn handle_create_flow_task(
async fn handle_alter_logical_table_tasks(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
alter_table_tasks: Vec<AlterTableTask>,
) -> Result<SubmitDdlTaskResponse> {
ensure!(
@@ -733,7 +691,7 @@ async fn handle_alter_logical_table_tasks(
let num_logical_tables = alter_table_tasks.len();
let (id, _) = ddl_manager
.submit_alter_logical_table_tasks(cluster_id, alter_table_tasks, physical_table_id)
.submit_alter_logical_table_tasks(alter_table_tasks, physical_table_id)
.await?;
info!("{num_logical_tables} logical tables on physical table: {physical_table_id:?} is altered via procedure_id {id:?}");
@@ -749,11 +707,10 @@ async fn handle_alter_logical_table_tasks(
/// Handle the `[CreateViewTask]` and returns the DDL response when success.
async fn handle_create_view_task(
ddl_manager: &DdlManager,
cluster_id: ClusterId,
create_view_task: CreateViewTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, output) = ddl_manager
.submit_create_view_task(cluster_id, create_view_task)
.submit_create_view_task(create_view_task)
.await?;
let procedure_id = id.to_string();
@@ -788,55 +745,43 @@ impl ProcedureExecutor for DdlManager {
.unwrap_or(TracingContext::from_current_span())
.attach(tracing::info_span!("DdlManager::submit_ddl_task"));
async move {
let cluster_id = ctx.cluster_id.unwrap_or_default();
debug!("Submitting Ddl task: {:?}", request.task);
match request.task {
CreateTable(create_table_task) => {
handle_create_table_task(self, cluster_id, create_table_task).await
}
DropTable(drop_table_task) => {
handle_drop_table_task(self, cluster_id, drop_table_task).await
handle_create_table_task(self, create_table_task).await
}
DropTable(drop_table_task) => handle_drop_table_task(self, drop_table_task).await,
AlterTable(alter_table_task) => {
handle_alter_table_task(self, cluster_id, alter_table_task).await
handle_alter_table_task(self, alter_table_task).await
}
TruncateTable(truncate_table_task) => {
handle_truncate_table_task(self, cluster_id, truncate_table_task).await
handle_truncate_table_task(self, truncate_table_task).await
}
CreateLogicalTables(create_table_tasks) => {
handle_create_logical_table_tasks(self, cluster_id, create_table_tasks).await
handle_create_logical_table_tasks(self, create_table_tasks).await
}
AlterLogicalTables(alter_table_tasks) => {
handle_alter_logical_table_tasks(self, cluster_id, alter_table_tasks).await
handle_alter_logical_table_tasks(self, alter_table_tasks).await
}
DropLogicalTables(_) => todo!(),
CreateDatabase(create_database_task) => {
handle_create_database_task(self, cluster_id, create_database_task).await
handle_create_database_task(self, create_database_task).await
}
DropDatabase(drop_database_task) => {
handle_drop_database_task(self, cluster_id, drop_database_task).await
handle_drop_database_task(self, drop_database_task).await
}
AlterDatabase(alter_database_task) => {
handle_alter_database_task(self, cluster_id, alter_database_task).await
handle_alter_database_task(self, alter_database_task).await
}
CreateFlow(create_flow_task) => {
handle_create_flow_task(
self,
cluster_id,
create_flow_task,
request.query_context.into(),
)
.await
}
DropFlow(drop_flow_task) => {
handle_drop_flow_task(self, cluster_id, drop_flow_task).await
handle_create_flow_task(self, create_flow_task, request.query_context.into())
.await
}
DropFlow(drop_flow_task) => handle_drop_flow_task(self, drop_flow_task).await,
CreateView(create_view_task) => {
handle_create_view_task(self, cluster_id, create_view_task).await
}
DropView(drop_view_task) => {
handle_drop_view_task(self, cluster_id, drop_view_task).await
handle_create_view_task(self, create_view_task).await
}
DropView(drop_view_task) => handle_drop_view_task(self, drop_view_task).await,
}
}
.trace(span)
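The visible call-site effect of these changes is that DDL submission no longer threads a cluster id. A minimal, hypothetical caller sketch (the `ddl_manager` and `drop_table_task` bindings are assumed):

// Previously: ddl_manager.submit_drop_table_task(cluster_id, drop_table_task).await?
// Only the task is passed now; the procedure id is still returned for tracking.
let (procedure_id, _output) = ddl_manager.submit_drop_table_task(drop_table_task).await?;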

View File

@@ -26,11 +26,10 @@ use crate::flow_name::FlowName;
use crate::key::schema_name::SchemaName;
use crate::key::FlowId;
use crate::peer::Peer;
use crate::{ClusterId, DatanodeId, FlownodeId};
use crate::{DatanodeId, FlownodeId};
#[derive(Eq, Hash, PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct RegionIdent {
pub cluster_id: ClusterId,
pub datanode_id: DatanodeId,
pub table_id: TableId,
pub region_number: RegionNumber,
@@ -47,8 +46,8 @@ impl Display for RegionIdent {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
"RegionIdent(datanode_id='{}.{}', table_id={}, region_number={}, engine = {})",
self.cluster_id, self.datanode_id, self.table_id, self.region_number, self.engine
"RegionIdent(datanode_id='{}', table_id={}, region_number={}, engine = {})",
self.datanode_id, self.table_id, self.region_number, self.engine
)
}
}
@@ -262,7 +261,6 @@ mod tests {
fn test_serialize_instruction() {
let open_region = Instruction::OpenRegion(OpenRegion::new(
RegionIdent {
cluster_id: 1,
datanode_id: 2,
table_id: 1024,
region_number: 1,
@@ -277,12 +275,11 @@ mod tests {
let serialized = serde_json::to_string(&open_region).unwrap();
assert_eq!(
r#"{"OpenRegion":{"region_ident":{"cluster_id":1,"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#,
r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#,
serialized
);
let close_region = Instruction::CloseRegion(RegionIdent {
cluster_id: 1,
datanode_id: 2,
table_id: 1024,
region_number: 1,
@@ -292,7 +289,7 @@ mod tests {
let serialized = serde_json::to_string(&close_region).unwrap();
assert_eq!(
r#"{"CloseRegion":{"cluster_id":1,"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#,
r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#,
serialized
);
}
@@ -307,7 +304,6 @@ mod tests {
#[test]
fn test_compatible_serialize_open_region() {
let region_ident = RegionIdent {
cluster_id: 1,
datanode_id: 2,
table_id: 1024,
region_number: 1,

View File

@@ -47,8 +47,6 @@ pub mod test_util;
pub mod util;
pub mod wal_options_allocator;
// The id of the cluster.
pub type ClusterId = u64;
// The id of the datanode.
pub type DatanodeId = u64;
// The id of the flownode.

View File

@@ -99,7 +99,7 @@ impl NodeExpiryListener {
in_memory: &ResettableKvBackendRef,
max_idle_time: Duration,
) -> error::Result<impl Iterator<Item = NodeInfoKey>> {
let prefix = NodeInfoKey::key_prefix_with_cluster_id(0);
let prefix = NodeInfoKey::key_prefix();
let req = RangeRequest::new().with_prefix(prefix);
let current_time_millis = common_time::util::current_time_millis();
let resp = in_memory.range(req).await?;

View File

@@ -19,7 +19,7 @@ use api::v1::meta::Peer as PbPeer;
use serde::{Deserialize, Serialize};
use crate::error::Error;
use crate::{ClusterId, DatanodeId, FlownodeId};
use crate::{DatanodeId, FlownodeId};
#[derive(Debug, Default, Clone, Hash, Eq, PartialEq, Deserialize, Serialize)]
pub struct Peer {
@@ -72,8 +72,8 @@ impl Display for Peer {
/// can query peer given a node id
#[async_trait::async_trait]
pub trait PeerLookupService {
async fn datanode(&self, cluster_id: ClusterId, id: DatanodeId) -> Result<Option<Peer>, Error>;
async fn flownode(&self, cluster_id: ClusterId, id: FlownodeId) -> Result<Option<Peer>, Error>;
async fn datanode(&self, id: DatanodeId) -> Result<Option<Peer>, Error>;
async fn flownode(&self, id: FlownodeId) -> Result<Option<Peer>, Error>;
}
pub type PeerLookupServiceRef = Arc<dyn PeerLookupService + Send + Sync>;

View File

@@ -31,11 +31,6 @@ impl ResponseHeader {
self.0.protocol_version
}
#[inline]
pub fn cluster_id(&self) -> u64 {
self.0.cluster_id
}
#[inline]
pub fn error_code(&self) -> i32 {
match self.0.error.as_ref() {
@@ -143,7 +138,6 @@ mod tests {
fn test_response_header_trans() {
let pb_header = PbResponseHeader {
protocol_version: 101,
cluster_id: 1,
error: Some(Error {
code: 100,
err_msg: "test".to_string(),
@@ -152,7 +146,6 @@ mod tests {
let header = ResponseHeader(pb_header);
assert_eq!(101, header.protocol_version());
assert_eq!(1, header.cluster_id());
assert_eq!(100, header.error_code());
assert_eq!("test".to_string(), header.error_msg());
}

View File

@@ -37,7 +37,7 @@ use crate::peer::{Peer, PeerLookupService};
use crate::region_keeper::MemoryRegionKeeper;
use crate::sequence::SequenceBuilder;
use crate::wal_options_allocator::WalOptionsAllocator;
use crate::{ClusterId, DatanodeId, FlownodeId};
use crate::{DatanodeId, FlownodeId};
#[async_trait::async_trait]
pub trait MockDatanodeHandler: Sync + Send + Clone {
@@ -189,11 +189,11 @@ pub struct NoopPeerLookupService;
#[async_trait::async_trait]
impl PeerLookupService for NoopPeerLookupService {
async fn datanode(&self, _cluster_id: ClusterId, id: DatanodeId) -> Result<Option<Peer>> {
async fn datanode(&self, id: DatanodeId) -> Result<Option<Peer>> {
Ok(Some(Peer::empty(id)))
}
async fn flownode(&self, _cluster_id: ClusterId, id: FlownodeId) -> Result<Option<Peer>> {
async fn flownode(&self, id: FlownodeId) -> Result<Option<Peer>> {
Ok(Some(Peer::empty(id)))
}
}

View File

@@ -235,7 +235,6 @@ mod tests {
Instruction::CloseRegion(RegionIdent {
table_id: region_id.table_id(),
region_number: region_id.region_number(),
cluster_id: 1,
datanode_id: 2,
engine: MITO_ENGINE_NAME.to_string(),
})
@@ -246,7 +245,6 @@ mod tests {
RegionIdent {
table_id: region_id.table_id(),
region_number: region_id.region_number(),
cluster_id: 1,
datanode_id: 2,
engine: MITO_ENGINE_NAME.to_string(),
},

View File

@@ -16,6 +16,7 @@ async-trait.workspace = true
bytes.workspace = true
cache.workspace = true
catalog.workspace = true
chrono.workspace = true
client.workspace = true
common-base.workspace = true
common-config.workspace = true

View File

@@ -49,12 +49,13 @@ pub(crate) use crate::adapter::node_context::FlownodeContext;
use crate::adapter::refill::RefillTask;
use crate::adapter::table_source::ManagedTableSource;
use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
pub(crate) use crate::adapter::worker::{create_worker, WorkerHandle};
use crate::compute::ErrCollector;
use crate::df_optimizer::sql_to_flow_plan;
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
use crate::expr::Batch;
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
use crate::recording_rules::RecordingRuleEngine;
use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
mod flownode_impl;
@@ -63,7 +64,7 @@ pub(crate) mod refill;
mod stat;
#[cfg(test)]
mod tests;
mod util;
pub(crate) mod util;
mod worker;
pub(crate) mod node_context;
@@ -103,7 +104,6 @@ impl Default for FlowConfig {
#[serde(default)]
pub struct FlownodeOptions {
pub mode: Mode,
pub cluster_id: Option<u64>,
pub node_id: Option<u64>,
pub flow: FlowConfig,
pub grpc: GrpcOptions,
@@ -118,7 +118,6 @@ impl Default for FlownodeOptions {
fn default() -> Self {
Self {
mode: servers::Mode::Standalone,
cluster_id: None,
node_id: None,
flow: FlowConfig::default(),
grpc: GrpcOptions::default().with_bind_addr("127.0.0.1:3004"),
@@ -171,6 +170,8 @@ pub struct FlowWorkerManager {
flush_lock: RwLock<()>,
/// receive a oneshot sender to send state size report
state_report_handler: RwLock<Option<StateReportHandler>>,
/// engine for recording rule
rule_engine: RecordingRuleEngine,
}
/// Building FlownodeManager
@@ -185,6 +186,7 @@ impl FlowWorkerManager {
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
rule_engine: RecordingRuleEngine,
) -> Self {
let srv_map = ManagedTableSource::new(
table_meta.table_info_manager().clone(),
@@ -207,6 +209,7 @@ impl FlowWorkerManager {
node_id,
flush_lock: RwLock::new(()),
state_report_handler: RwLock::new(None),
rule_engine,
}
}
@@ -215,25 +218,6 @@ impl FlowWorkerManager {
self
}
/// Create a flownode manager with one worker
pub fn new_with_workers<'s>(
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
num_workers: usize,
) -> (Self, Vec<Worker<'s>>) {
let mut zelf = Self::new(node_id, query_engine, table_meta);
let workers: Vec<_> = (0..num_workers)
.map(|_| {
let (handle, worker) = create_worker();
zelf.add_worker_handle(handle);
worker
})
.collect();
(zelf, workers)
}
/// add a worker handle to the manager, meaning the corresponding worker is under its management
pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
self.worker_handles.push(handle);
@@ -751,7 +735,11 @@ pub struct CreateFlowArgs {
/// Create&Remove flow
impl FlowWorkerManager {
/// remove a flow by its id
#[allow(unreachable_code)]
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.remove_flow(flow_id).await;
for handle in self.worker_handles.iter() {
if handle.contains_flow(flow_id).await? {
handle.remove_flow(flow_id).await?;
@@ -767,8 +755,10 @@ impl FlowWorkerManager {
/// steps to create task:
/// 1. parse query into typed plan (and optionally parse the expire_after expr)
/// 2. render source/sink with output table id and used input table id
#[allow(clippy::too_many_arguments)]
#[allow(clippy::too_many_arguments, unreachable_code)]
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.create_flow(args).await;
let CreateFlowArgs {
flow_id,
sink_table_name,

View File

@@ -153,7 +153,13 @@ impl Flownode for FlowWorkerManager {
}
}
#[allow(unreachable_code, unused)]
async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
return self
.rule_engine
.handle_inserts(request)
.await
.map_err(to_meta_err(snafu::location!()));
// using try_read to ensure two things:
// 1. flush wouldn't happen until inserts before it is inserted
// 2. inserts happening concurrently with flush wouldn't be block by flush
@@ -206,15 +212,15 @@ impl Flownode for FlowWorkerManager {
.collect_vec();
let table_col_names = table_schema.relation_desc.names;
let table_col_names = table_col_names
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
let name_to_col = HashMap::<_, _>::from_iter(
insert_schema
.iter()

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Some utility functions
use std::sync::Arc;
use api::helper::ColumnDataTypeWrapper;

View File

@@ -16,6 +16,7 @@
use std::any::Any;
use arrow_schema::ArrowError;
use common_error::ext::BoxedError;
use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
use common_macro::stack_trace_debug;
@@ -53,6 +54,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Time error"))]
Time {
source: common_time::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("External error"))]
External {
source: BoxedError,
@@ -156,6 +164,15 @@ pub enum Error {
location: Location,
},
#[snafu(display("Arrow error: {raw:?} in context: {context}"))]
Arrow {
#[snafu(source)]
raw: ArrowError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
Datafusion {
#[snafu(source)]
@@ -230,6 +247,7 @@ impl ErrorExt for Error {
match self {
Self::Eval { .. }
| Self::JoinTask { .. }
| Self::Arrow { .. }
| Self::Datafusion { .. }
| Self::InsertIntoFlow { .. } => StatusCode::Internal,
Self::FlowAlreadyExist { .. } => StatusCode::TableAlreadyExists,
@@ -238,7 +256,9 @@ impl ErrorExt for Error {
| Self::FlowNotFound { .. }
| Self::ListFlows { .. } => StatusCode::TableNotFound,
Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
Self::InvalidQuery { .. } | Self::CreateFlow { .. } => StatusCode::EngineExecuteQuery,
Self::InvalidQuery { .. } | Self::CreateFlow { .. } | Self::Time { .. } => {
StatusCode::EngineExecuteQuery
}
Self::Unexpected { .. } => StatusCode::Unexpected,
Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
StatusCode::Unsupported

View File

@@ -238,6 +238,7 @@ mod test {
for (sql, current, expected) in &testcases {
let plan = sql_to_substrait(engine.clone(), sql).await;
let mut ctx = create_test_ctx();
let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
.await

View File

@@ -130,13 +130,6 @@ impl HeartbeatTask {
pub fn shutdown(&self) {
info!("Close heartbeat task for flownode");
if self
.running
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
.is_err()
{
warn!("Call close heartbeat task multiple times");
}
}
fn new_heartbeat_request(

View File

@@ -33,6 +33,7 @@ mod expr;
pub mod heartbeat;
mod metrics;
mod plan;
mod recording_rules;
mod repr;
mod server;
mod transform;
@@ -43,4 +44,5 @@ mod test_utils;
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
pub use error::{Error, Result};
pub use recording_rules::FrontendClient;
pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};

View File

@@ -28,6 +28,32 @@ lazy_static! {
&["table_id"]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_QUERY_TIME: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_query_time",
"flow rule engine query time",
&["flow_id"],
vec![
0.0,
1.,
3.,
5.,
10.,
20.,
30.,
60.,
2. * 60.,
5. * 60.,
10. * 60.
]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_SLOW_QUERY: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_slow_query",
"flow rule engine slow query",
&["flow_id", "sql", "peer"],
vec![60., 2. * 60., 3. * 60., 5. * 60., 10. * 60.]
)
.unwrap();
pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(

View File

@@ -0,0 +1,940 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Run flow as a recording rule: a time-window-aware normal query triggered on every tick configured by the user
mod engine;
mod frontend_client;
use std::collections::BTreeSet;
use std::sync::Arc;
use api::helper::pb_value_to_value_ref;
use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_recordbatch::DfRecordBatch;
use common_telemetry::warn;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::Expr;
use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
use datafusion::prelude::SessionContext;
use datafusion::sql::unparser::Unparser;
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter};
use datafusion_common::{DFSchema, TableReference};
use datafusion_expr::{ColumnarValue, LogicalPlan};
use datafusion_physical_expr::PhysicalExprRef;
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::scalars::ScalarVector;
use datatypes::schema::TIME_INDEX_KEY;
use datatypes::value::Value;
use datatypes::vectors::{
TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
TimestampSecondVector, Vector,
};
pub use engine::RecordingRuleEngine;
pub use frontend_client::FrontendClient;
use itertools::Itertools;
use query::parser::QueryLanguageParser;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use crate::adapter::util::from_proto_to_data_type;
use crate::df_optimizer::apply_df_optimizer;
use crate::error::{ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, UnexpectedSnafu};
use crate::expr::error::DataTypeSnafu;
use crate::Error;
#[derive(Debug, Clone)]
pub struct TimeWindowExpr {
phy_expr: PhysicalExprRef,
column_name: String,
logical_expr: Expr,
df_schema: DFSchema,
}
impl TimeWindowExpr {
pub fn from_expr(expr: &Expr, column_name: &str, df_schema: &DFSchema) -> Result<Self, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
Ok(Self {
phy_expr,
column_name: column_name.to_string(),
logical_expr: expr.clone(),
df_schema: df_schema.clone(),
})
}
pub fn eval(
&self,
current: Timestamp,
) -> Result<(Option<Timestamp>, Option<Timestamp>), Error> {
let lower_bound =
find_expr_time_window_lower_bound(&self.logical_expr, &self.df_schema, current)?;
let upper_bound =
find_expr_time_window_upper_bound(&self.logical_expr, &self.df_schema, current)?;
Ok((lower_bound, upper_bound))
}
/// Find timestamps from rows using time window expr
pub async fn handle_rows(
&self,
rows_list: Vec<api::v1::Rows>,
) -> Result<BTreeSet<Timestamp>, Error> {
let mut time_windows = BTreeSet::new();
for rows in rows_list {
// pick the time index column and use it to evaluate `self.phy_expr`
let ts_col_index = rows
.schema
.iter()
.map(|col| col.column_name.clone())
.position(|name| name == self.column_name);
let Some(ts_col_index) = ts_col_index else {
warn!("can't found time index column in schema: {:?}", rows.schema);
continue;
};
let col_schema = &rows.schema[ts_col_index];
let cdt = from_proto_to_data_type(col_schema)?;
let column_values = rows
.rows
.iter()
.map(|row| &row.values[ts_col_index])
.collect_vec();
let mut vector = cdt.create_mutable_vector(column_values.len());
for value in column_values {
let value = pb_value_to_value_ref(value, &None);
vector.try_push_value_ref(value).context(DataTypeSnafu {
msg: "Failed to convert rows to columns",
})?;
}
let vector = vector.to_vector();
let df_schema = create_df_schema_for_ts_column(&self.column_name, cdt)?;
let rb =
DfRecordBatch::try_new(df_schema.inner().clone(), vec![vector.to_arrow_array()])
.with_context(|_e| ArrowSnafu {
context: format!(
"Failed to create record batch from {df_schema:?} and {vector:?}"
),
})?;
let eval_res = self
.phy_expr
.evaluate(&rb)
.with_context(|_| DatafusionSnafu {
context: format!(
"Failed to evaluate physical expression {:?} on {rb:?}",
self.phy_expr
),
})?;
let res = columnar_to_ts_vector(&eval_res)?;
for ts in res.into_iter().flatten() {
time_windows.insert(ts);
}
}
Ok(time_windows)
}
}
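A minimal usage sketch for `TimeWindowExpr` (hypothetical: it assumes a `date_bin`-style group expression over a millisecond time index column named "ts", plus an `api::v1::Rows` batch bound to `rows`):

// Build the evaluator once from the time window expr and its one-column schema.
let window_expr = TimeWindowExpr::from_expr(&time_window_expr, "ts", &df_schema)?;
let now = Timestamp::new(common_time::util::current_time_millis(), TimeUnit::Millisecond);
// For a 5-minute date_bin this yields roughly [00:00:00, 00:05:00) around `now`.
let (lower, upper) = window_expr.eval(now)?;
// Collect the distinct window starts touched by an inserted batch of rows.
let dirty_windows = window_expr.handle_rows(vec![rows]).await?;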
fn create_df_schema_for_ts_column(name: &str, cdt: ConcreteDataType) -> Result<DFSchema, Error> {
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
name,
cdt.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare("TimeIndexOnlyTable"))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
Ok(df_schema)
}
/// Convert `ColumnarValue` to `Vec<Option<Timestamp>>`
fn columnar_to_ts_vector(columnar: &ColumnarValue) -> Result<Vec<Option<Timestamp>>, Error> {
let val = match columnar {
datafusion_expr::ColumnarValue::Array(array) => {
let ty = array.data_type();
let ty = ConcreteDataType::from_arrow_type(ty);
let time_unit = if let ConcreteDataType::Timestamp(ty) = ty {
ty.unit()
} else {
return UnexpectedSnafu {
reason: format!("Non-timestamp type: {ty:?}"),
}
.fail();
};
match time_unit {
TimeUnit::Second => TimestampSecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec(),
TimeUnit::Millisecond => {
TimestampMillisecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
}
}
datafusion_expr::ColumnarValue::Scalar(scalar) => {
let value = Value::try_from(scalar.clone()).with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert scalar {scalar:?} to value"),
})?;
let ts = value.as_timestamp().context(UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", value),
})?;
vec![Some(ts)]
}
};
Ok(val)
}
/// Convert sql to datafusion logical plan
pub async fn sql_to_df_plan(
query_ctx: QueryContextRef,
engine: QueryEngineRef,
sql: &str,
optimize: bool,
) -> Result<LogicalPlan, Error> {
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = engine
.planner()
.plan(&stmt, query_ctx)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = if optimize {
apply_df_optimizer(plan).await?
} else {
plan
};
Ok(plan)
}
/// Return (the column name of time index column, the time window expr, the expected time unit of time index column, the expr's schema for evaluating the time window)
async fn find_time_window_expr(
plan: &LogicalPlan,
catalog_man: CatalogManagerRef,
query_ctx: QueryContextRef,
) -> Result<(String, Option<datafusion_expr::Expr>, TimeUnit, DFSchema), Error> {
// TODO(discord9): find the expr that does the time window
let mut table_name = None;
// first find the table source in the logical plan
plan.apply(|plan| {
let LogicalPlan::TableScan(table_scan) = plan else {
return Ok(TreeNodeRecursion::Continue);
};
table_name = Some(table_scan.table_name.clone());
Ok(TreeNodeRecursion::Stop)
})
.with_context(|_| DatafusionSnafu {
context: format!("Can't find table source in plan {plan:?}"),
})?;
let Some(table_name) = table_name else {
UnexpectedSnafu {
reason: format!("Can't find table source in plan {plan:?}"),
}
.fail()?
};
let current_schema = query_ctx.current_schema();
let catalog_name = table_name.catalog().unwrap_or(query_ctx.current_catalog());
let schema_name = table_name.schema().unwrap_or(&current_schema);
let table_name = table_name.table();
let Some(table_ref) = catalog_man
.table(catalog_name, schema_name, table_name, Some(&query_ctx))
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
else {
UnexpectedSnafu {
reason: format!(
"Can't find table {table_name:?} in catalog {catalog_name:?}/{schema_name:?}"
),
}
.fail()?
};
let schema = &table_ref.table_info().meta.schema;
let ts_index = schema.timestamp_column().context(UnexpectedSnafu {
reason: format!("Can't find timestamp column in table {table_name:?}"),
})?;
let ts_col_name = ts_index.name.clone();
let expected_time_unit = ts_index.data_type.as_timestamp().with_context(|| UnexpectedSnafu {
reason: format!(
"Expected timestamp column {ts_col_name:?} in table {table_name:?} to be timestamp, but got {ts_index:?}"
),
})?.unit();
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
ts_col_name.clone(),
ts_index.data_type.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare(table_name))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
// find the time window expr which refers to the time index column
let mut aggr_expr = None;
let mut time_window_expr: Option<Expr> = None;
let find_inner_aggr_expr = |plan: &LogicalPlan| {
if let LogicalPlan::Aggregate(aggregate) = plan {
aggr_expr = Some(aggregate.clone());
};
Ok(TreeNodeRecursion::Continue)
};
plan.apply(find_inner_aggr_expr)
.with_context(|_| DatafusionSnafu {
context: format!("Can't find aggr expr in plan {plan:?}"),
})?;
if let Some(aggregate) = aggr_expr {
for group_expr in &aggregate.group_expr {
let refs = group_expr.column_refs();
if refs.len() != 1 {
continue;
}
let ref_col = refs.iter().next().unwrap();
let index = aggregate.input.schema().maybe_index_of_column(ref_col);
let Some(index) = index else {
continue;
};
let field = aggregate.input.schema().field(index);
let is_time_index = field.metadata().get(TIME_INDEX_KEY) == Some(&"true".to_string());
if is_time_index {
let rewrite_column = group_expr.clone();
let rewritten = rewrite_column
.rewrite(&mut RewriteColumn {
table_name: table_name.to_string(),
})
.with_context(|_| DatafusionSnafu {
context: format!("Rewrite expr failed, expr={:?}", group_expr),
})?
.data;
struct RewriteColumn {
table_name: String,
}
impl TreeNodeRewriter for RewriteColumn {
type Node = Expr;
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
let Expr::Column(mut column) = node else {
return Ok(Transformed::no(node));
};
column.relation = Some(TableReference::bare(self.table_name.clone()));
Ok(Transformed::yes(Expr::Column(column)))
}
}
time_window_expr = Some(rewritten);
break;
}
}
Ok((ts_col_name, time_window_expr, expected_time_unit, df_schema))
} else {
// can't find time window expr, return None
Ok((ts_col_name, None, expected_time_unit, df_schema))
}
}
/// Find the time window bounds for time `current` in the given `plan` using the time window expr.
/// i.e. for time window expr being `date_bin(INTERVAL '5 minutes', ts) as time_window` and `current="2021-07-01 00:01:01.000"`,
/// the lower bound is `Some("2021-07-01 00:00:00.000")`
/// if `plan` doesn't contain a time window expr over the `TIME INDEX` column, both bounds are `None`
///
/// A time window expr is an expr that:
/// 1. refers only to a time index column
/// 2. is monotonically increasing
/// 3. shows up in the GROUP BY clause
///
/// note this plan should only contain one TableScan
pub async fn find_plan_time_window_bound(
plan: &LogicalPlan,
current: Timestamp,
query_ctx: QueryContextRef,
engine: QueryEngineRef,
) -> Result<(String, Option<Timestamp>, Option<Timestamp>), Error> {
// TODO(discord9): find the expr that does the time window
let catalog_man = engine.engine_state().catalog_manager();
let (ts_col_name, time_window_expr, expected_time_unit, df_schema) =
find_time_window_expr(plan, catalog_man.clone(), query_ctx).await?;
// cast current to ts_index's type
let new_current = current
.convert_to(expected_time_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to cast current timestamp {current:?} to {expected_time_unit}"),
})?;
// if no time_window_expr is found, return None
if let Some(time_window_expr) = time_window_expr {
let lower_bound =
find_expr_time_window_lower_bound(&time_window_expr, &df_schema, new_current)?;
let upper_bound =
find_expr_time_window_upper_bound(&time_window_expr, &df_schema, new_current)?;
Ok((ts_col_name, lower_bound, upper_bound))
} else {
Ok((ts_col_name, None, None))
}
}
/// Find the lower bound of time window in given `expr` and `current` timestamp.
///
/// i.e. for `current="2021-07-01 00:01:01.000"` and `expr=date_bin(INTERVAL '5 minutes', ts) as time_window` and `ts_col=ts`,
/// return `Some("2021-07-01 00:00:00.000")` since it's the lower bound
/// return `Some("2021-07-01 00:00:00.000")` since it's the lower bound
/// of current time window given the current timestamp
///
/// if return None, meaning this time window have no lower bound
fn find_expr_time_window_lower_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
let input_time_unit = cur_time_window.unit();
Ok(cur_time_window.convert_to(input_time_unit))
}
/// Find the upper bound for time window expression
fn find_expr_time_window_upper_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
use std::cmp::Ordering;
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
// search for the upper bound of the current time window
let mut offset: i64 = 1;
let mut lower_bound = Some(current);
let upper_bound;
// first, an exponential probe to find a range for the binary search
loop {
let Some(next_val) = current.value().checked_add(offset) else {
// no upper bound if overflow
return Ok(None);
};
let next_time_probe = common_time::Timestamp::new(next_val, current.unit());
let next_time_window = eval_ts_to_ts(&phy_expr, df_schema, next_time_probe)?;
match next_time_window.cmp(&cur_time_window) {
Ordering::Less => {UnexpectedSnafu {
reason: format!(
"Unsupported time window expression, expect monotonic increasing for time window expression {expr:?}"
),
}
.fail()?
}
Ordering::Equal => {
lower_bound = Some(next_time_probe);
}
Ordering::Greater => {
upper_bound = Some(next_time_probe);
break
}
}
let Some(new_offset) = offset.checked_mul(2) else {
// no upper bound if overflow
return Ok(None);
};
offset = new_offset;
}
// binary search for the exact upper bound
ensure!(
    lower_bound.map(|v| v.unit()) == upper_bound.map(|v| v.unit()),
    UnexpectedSnafu {
        reason: format!(
            "Unit mismatch for time window expression {expr:?}: found {lower_bound:?} and {upper_bound:?}"
        ),
    }
);
let output_unit = upper_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.unit();
let mut low = lower_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.value();
let mut high = upper_bound
.context(UnexpectedSnafu {
reason: "should have upper bound",
})?
.value();
while low < high {
let mid = (low + high) / 2;
let mid_probe = common_time::Timestamp::new(mid, output_unit);
let mid_time_window = eval_ts_to_ts(&phy_expr, df_schema, mid_probe)?;
match mid_time_window.cmp(&cur_time_window) {
Ordering::Less => UnexpectedSnafu {
reason: format!("Binary search failed for time window expression {expr:?}"),
}
.fail()?,
Ordering::Equal => low = mid + 1,
Ordering::Greater => high = mid,
}
}
let final_upper_bound_for_time_window = common_time::Timestamp::new(high, output_unit);
Ok(Some(final_upper_bound_for_time_window))
}
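// A minimal illustrative sketch (not part of the change above; the helper name
// `probe_upper_bound` is hypothetical): the same exponential-probe-then-binary-search
// pattern on a plain monotone step function, here `f(x) = x / 300` standing in for a
// "5 minute" time window expression over non-negative values.
#[allow(dead_code)]
fn probe_upper_bound(current: i64) -> Option<i64> {
    let f = |x: i64| x / 300; // monotone non-decreasing, like a `date_bin` call
    let cur_window = f(current);
    // exponential probe: find some point that already falls into the next window
    let mut offset: i64 = 1;
    let mut high = loop {
        let next = current.checked_add(offset)?;
        if f(next) > cur_window {
            break next;
        }
        // no upper bound if the probe overflows
        offset = offset.checked_mul(2)?;
    };
    // binary search for the smallest point that maps to the next window
    let mut low = current;
    while low < high {
        let mid = low + (high - low) / 2;
        if f(mid) > cur_window {
            high = mid;
        } else {
            low = mid + 1;
        }
    }
    Some(high)
}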
fn eval_ts_to_ts(
phy: &PhysicalExprRef,
df_schema: &DFSchema,
input_value: Timestamp,
) -> Result<Timestamp, Error> {
let schema_ty = df_schema.field(0).data_type();
let schema_cdt = ConcreteDataType::from_arrow_type(schema_ty);
let schema_unit = if let ConcreteDataType::Timestamp(ts) = schema_cdt {
ts.unit()
} else {
return UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", schema_cdt),
}
.fail();
};
let input_value = input_value
.convert_to(schema_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to convert timestamp {input_value:?} to {schema_unit}"),
})?;
let ts_vector = match schema_unit {
TimeUnit::Second => {
TimestampSecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Millisecond => {
TimestampMillisecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
};
let rb = DfRecordBatch::try_new(df_schema.inner().clone(), vec![ts_vector.clone()])
.with_context(|_| ArrowSnafu {
context: format!("Failed to create record batch from {df_schema:?} and {ts_vector:?}"),
})?;
let eval_res = phy.evaluate(&rb).with_context(|_| DatafusionSnafu {
context: format!("Failed to evaluate physical expression {phy:?} on {rb:?}"),
})?;
if let Some(Some(ts)) = columnar_to_ts_vector(&eval_res)?.first() {
Ok(*ts)
} else {
UnexpectedSnafu {
reason: format!(
"Expected timestamp in expression {phy:?} but got {:?}",
eval_res
),
}
.fail()?
}
}
// TODO(discord9): a method to find out the precise time window
/// Find the `Filter` node corresponding to the outermost `WHERE` and add a new filter expr to it
/// (or insert a new `Filter` above the `TableScan` if none exists)
#[derive(Debug)]
pub struct AddFilterRewriter {
extra_filter: Expr,
is_rewritten: bool,
}
impl AddFilterRewriter {
fn new(filter: Expr) -> Self {
Self {
extra_filter: filter,
is_rewritten: false,
}
}
}
impl TreeNodeRewriter for AddFilterRewriter {
type Node = LogicalPlan;
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if self.is_rewritten {
return Ok(Transformed::no(node));
}
match node {
LogicalPlan::Filter(mut filter) if !filter.having => {
filter.predicate = filter.predicate.and(self.extra_filter.clone());
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
LogicalPlan::TableScan(_) => {
// add a new filter
let filter =
datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
_ => Ok(Transformed::no(node)),
}
}
}
fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
/// A dialect that forces all identifiers to be quoted
struct ForceQuoteIdentifiers;
impl datafusion::sql::unparser::dialect::Dialect for ForceQuoteIdentifiers {
fn identifier_quote_style(&self, identifier: &str) -> Option<char> {
if identifier.to_lowercase() != identifier {
Some('"')
} else {
None
}
}
}
let unparser = Unparser::new(&ForceQuoteIdentifiers);
// unparse the plan; columns are already fully qualified by the planner
let sql = unparser
.plan_to_sql(plan)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to unparse logical plan {plan:?}"),
})?;
Ok(sql.to_string())
}
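// Illustrative sketch (not part of the change above): the quoting rule applied by
// `ForceQuoteIdentifiers`, extracted into a plain function with a couple of asserted
// examples. Only identifiers that are not already all-lowercase get quoted.
#[allow(dead_code)]
fn quote_style_sketch(identifier: &str) -> Option<char> {
    if identifier.to_lowercase() != identifier {
        Some('"')
    } else {
        None
    }
}

#[allow(dead_code)]
fn quote_style_sketch_examples() {
    // lowercase identifiers are left unquoted
    assert_eq!(quote_style_sketch("numbers_with_ts"), None);
    // mixed/uppercase identifiers are double-quoted to preserve their case
    assert_eq!(quote_style_sketch("UPPERCASE_NUMBERS_WITH_TS"), Some('"'));
}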
#[cfg(test)]
mod test {
use datafusion_common::tree_node::TreeNode;
use pretty_assertions::assert_eq;
use session::context::QueryContext;
use super::{sql_to_df_plan, *};
use crate::recording_rules::{df_plan_to_sql, AddFilterRewriter};
use crate::test_utils::create_test_query_engine;
#[tokio::test]
async fn test_sql_plan_convert() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let old = r#"SELECT "NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#;
let new = sql_to_df_plan(ctx.clone(), query_engine.clone(), old, false)
.await
.unwrap();
let new_sql = df_plan_to_sql(&new).unwrap();
assert_eq!(
r#"SELECT "UPPERCASE_NUMBERS_WITH_TS"."NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#,
new_sql
);
}
#[tokio::test]
async fn test_add_filter() {
let testcases = vec![
(
"SELECT number FROM numbers_with_ts GROUP BY number","SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
),
(
"SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
"SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
),
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
)
];
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
for (before, after) in testcases {
let sql = before;
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
let plan = plan.rewrite(&mut add_filter).unwrap().data;
let new_sql = df_plan_to_sql(&plan).unwrap();
assert_eq!(after, new_sql);
}
}
#[tokio::test]
async fn test_plan_time_window_lower_bound() {
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let testcases = [
// same alias is not same column
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394109000, TimeUnit::Millisecond)),
Some(Timestamp::new(1740394109001, TimeUnit::Millisecond)),
),
r#"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:29' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:48:29.001' AS TIMESTAMP))) GROUP BY numbers_with_ts.ts"#
),
// complex time window index
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394080, TimeUnit::Second)),
Some(Timestamp::new(1740394140, TimeUnit::Second)),
),
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:00' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:49:00' AS TIMESTAMP))) GROUP BY arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)')"
),
// no time index
(
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;",
Timestamp::new(23, TimeUnit::Millisecond),
("ts".to_string(), None, None),
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;"
),
// time index
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// on spot
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(0, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// different time unit
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23_000_000, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other fields
(
"SELECT sum(number) as sum_up, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number) AS sum_up, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other pks
(
"SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number"
),
// subquery
(
"SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
),
// cte
(
"with cte as (select number, date_bin('5 minutes', ts) as time_window from numbers_with_ts GROUP BY time_window, number) select number, time_window from cte;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT cte.number, cte.time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number) AS cte"
),
// complex subquery without alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
),
// complex subquery alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) AS cte GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
),
];
for (sql, current, expected, expected_unparsed) in testcases {
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
.await
.unwrap();
let real =
find_plan_time_window_bound(&plan, current, ctx.clone(), query_engine.clone())
.await
.unwrap();
assert_eq!(expected, real);
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let (col_name, lower, upper) = real;
let new_sql = if lower.is_some() {
let to_df_literal = |value| {
let value = Value::from(value);
value.try_to_scalar_value(&value.data_type()).unwrap()
};
let lower = to_df_literal(lower.unwrap());
let upper = to_df_literal(upper.unwrap());
let expr = col(&col_name)
.gt_eq(lit(lower))
.and(col(&col_name).lt_eq(lit(upper)));
let mut add_filter = AddFilterRewriter::new(expr);
let plan = plan.rewrite(&mut add_filter).unwrap().data;
df_plan_to_sql(&plan).unwrap()
} else {
sql.to_string()
};
assert_eq!(expected_unparsed, new_sql);
}
}
}

View File

@@ -0,0 +1,815 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use api::v1::flow::FlowResponse;
use common_error::ext::BoxedError;
use common_meta::ddl::create_flow::FlowType;
use common_meta::key::flow::FlowMetadataManagerRef;
use common_meta::key::table_info::TableInfoManager;
use common_meta::key::TableMetadataManagerRef;
use common_telemetry::tracing::warn;
use common_telemetry::{debug, info};
use common_time::Timestamp;
use datafusion::sql::unparser::expr_to_sql;
use datafusion_common::tree_node::TreeNode;
use datatypes::value::Value;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use table::metadata::TableId;
use tokio::sync::oneshot::error::TryRecvError;
use tokio::sync::{oneshot, RwLock};
use tokio::time::Instant;
use super::frontend_client::FrontendClient;
use super::{df_plan_to_sql, AddFilterRewriter, TimeWindowExpr};
use crate::adapter::{CreateFlowArgs, FlowId, TableName};
use crate::error::{
DatafusionSnafu, DatatypesSnafu, ExternalSnafu, FlowAlreadyExistSnafu, InternalSnafu,
TimeSnafu, UnexpectedSnafu,
};
use crate::metrics::{METRIC_FLOW_RULE_ENGINE_QUERY_TIME, METRIC_FLOW_RULE_ENGINE_SLOW_QUERY};
use crate::recording_rules::{find_time_window_expr, sql_to_df_plan};
use crate::Error;
/// TODO(discord9): make those constants configurable
/// The default rule engine query timeout is 10 minutes
pub const DEFAULT_RULE_ENGINE_QUERY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
/// will output a warn log for any query that runs for more than 1 minute, and again every minute while that query is still running
pub const SLOW_QUERY_THRESHOLD: Duration = Duration::from_secs(60);
/// TODO(discord9): determine how to configure refresh rate
pub struct RecordingRuleEngine {
tasks: RwLock<BTreeMap<FlowId, RecordingRuleTask>>,
shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
frontend_client: Arc<FrontendClient>,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
engine: QueryEngineRef,
}
impl RecordingRuleEngine {
pub fn new(
frontend_client: Arc<FrontendClient>,
engine: QueryEngineRef,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
) -> Self {
Self {
tasks: Default::default(),
shutdown_txs: Default::default(),
frontend_client,
flow_metadata_manager,
table_meta,
engine,
}
}
pub async fn handle_inserts(
&self,
request: api::v1::region::InsertRequests,
) -> Result<FlowResponse, Error> {
let table_info_mgr = self.table_meta.table_info_manager();
let mut group_by_table_name: HashMap<TableName, Vec<api::v1::Rows>> = HashMap::new();
for r in request.requests {
let tid = RegionId::from(r.region_id).table_id();
let name = get_table_name(table_info_mgr, &tid).await?;
let entry = group_by_table_name.entry(name).or_default();
if let Some(rows) = r.rows {
entry.push(rows);
}
}
for (_flow_id, task) in self.tasks.read().await.iter() {
let src_table_names = &task.source_table_names;
for src_table_name in src_table_names {
if let Some(entry) = group_by_table_name.get(src_table_name) {
let Some(expr) = &task.time_window_expr else {
continue;
};
let involved_time_windows = expr.handle_rows(entry.clone()).await?;
let mut state = task.state.write().await;
state
.dirty_time_windows
.add_lower_bounds(involved_time_windows.into_iter());
}
}
}
Ok(Default::default())
}
}
async fn get_table_name(zelf: &TableInfoManager, table_id: &TableId) -> Result<TableName, Error> {
zelf.get(*table_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", table_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])
}
const MIN_REFRESH_DURATION: Duration = Duration::new(5, 0);
impl RecordingRuleEngine {
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
let CreateFlowArgs {
flow_id,
sink_table_name,
source_table_ids,
create_if_not_exists,
or_replace,
expire_after,
comment: _,
sql,
flow_options,
query_ctx,
} = args;
// or replace logic
{
let is_exist = self.tasks.read().await.contains_key(&flow_id);
match (create_if_not_exists, or_replace, is_exist) {
// if replace, ignore that old flow exists
(_, true, true) => {
info!("Replacing flow with id={}", flow_id);
}
(false, false, true) => FlowAlreadyExistSnafu { id: flow_id }.fail()?,
// already exists, and not replace, return None
(true, false, true) => {
info!("Flow with id={} already exists, do nothing", flow_id);
return Ok(None);
}
// continue as normal
(_, _, false) => (),
}
}
let flow_type = flow_options.get(FlowType::FLOW_TYPE_KEY);
ensure!(
flow_type == Some(&FlowType::RecordingRule.to_string()) || flow_type.is_none(),
UnexpectedSnafu {
reason: format!("Flow type is not RecordingRule nor None, got {flow_type:?}")
}
);
let Some(query_ctx) = query_ctx else {
UnexpectedSnafu {
reason: "Query context is None".to_string(),
}
.fail()?
};
let query_ctx = Arc::new(query_ctx);
let mut source_table_names = Vec::new();
for src_id in source_table_ids {
let table_name = self
.table_meta
.table_info_manager()
.get(src_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", src_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])?;
source_table_names.push(table_name);
}
let (tx, rx) = oneshot::channel();
let plan = sql_to_df_plan(query_ctx.clone(), self.engine.clone(), &sql, true).await?;
let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
&plan,
self.engine.engine_state().catalog_manager().clone(),
query_ctx.clone(),
)
.await?;
let phy_expr = time_window_expr
.map(|expr| TimeWindowExpr::from_expr(&expr, &column_name, &df_schema))
.transpose()?;
info!("Flow id={}, found time window expr={:?}", flow_id, phy_expr);
let task = RecordingRuleTask::new(
flow_id,
&sql,
phy_expr,
expire_after,
sink_table_name,
source_table_names,
query_ctx,
rx,
);
let task_inner = task.clone();
let engine = self.engine.clone();
let frontend = self.frontend_client.clone();
// TODO(discord9): also save the handle & use a timer wheel or similar for better scheduling
let _handle = common_runtime::spawn_global(async move {
match task_inner.start_executing(engine, frontend).await {
Ok(()) => info!("Flow {} shutdown", task_inner.flow_id),
Err(err) => common_telemetry::error!(
"Flow {} encounter unrecoverable error: {err:?}",
task_inner.flow_id
),
}
});
// TODO(discord9): deal with replace logic
let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
drop(replaced_old_task_opt);
self.shutdown_txs.write().await.insert(flow_id, tx);
Ok(Some(flow_id))
}
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
if self.tasks.write().await.remove(&flow_id).is_none() {
warn!("Flow {flow_id} not found in tasks")
}
let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
UnexpectedSnafu {
reason: format!("Can't found shutdown tx for flow {flow_id}"),
}
.fail()?
};
if tx.send(()).is_err() {
warn!("Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?")
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct RecordingRuleTask {
pub flow_id: FlowId,
query: String,
pub time_window_expr: Option<TimeWindowExpr>,
/// in seconds
pub expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: HashSet<[String; 3]>,
state: Arc<RwLock<RecordingRuleState>>,
}
impl RecordingRuleTask {
#[allow(clippy::too_many_arguments)]
pub fn new(
flow_id: FlowId,
query: &str,
time_window_expr: Option<TimeWindowExpr>,
expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: Vec<[String; 3]>,
query_ctx: QueryContextRef,
shutdown_rx: oneshot::Receiver<()>,
) -> Self {
Self {
flow_id,
query: query.to_string(),
time_window_expr,
expire_after,
sink_table_name,
source_table_names: source_table_names.into_iter().collect(),
state: Arc::new(RwLock::new(RecordingRuleState::new(query_ctx, shutdown_rx))),
}
}
}
impl RecordingRuleTask {
/// This should be called in a new tokio task
pub async fn start_executing(
&self,
engine: QueryEngineRef,
frontend_client: Arc<FrontendClient>,
) -> Result<(), Error> {
// only the first query doesn't need an upper bound
let mut is_first = true;
loop {
// FIXME(discord9): test whether also requiring an upper bound works
let new_query = self.gen_query_with_time_window(engine.clone()).await?;
let insert_into = if let Some(new_query) = new_query {
format!(
"INSERT INTO {}.{}.{} {}",
self.sink_table_name[0],
self.sink_table_name[1],
self.sink_table_name[2],
new_query
)
} else {
tokio::time::sleep(MIN_REFRESH_DURATION).await;
continue;
};
if is_first {
is_first = false;
}
let instant = Instant::now();
let flow_id = self.flow_id;
let db_client = frontend_client.get_database_client().await?;
let peer_addr = db_client.peer.addr;
debug!(
"Executing flow {flow_id}(expire_after={:?} secs) on {:?} with query {}",
self.expire_after, peer_addr, &insert_into
);
let timer = METRIC_FLOW_RULE_ENGINE_QUERY_TIME
.with_label_values(&[flow_id.to_string().as_str()])
.start_timer();
let res = db_client.database.sql(&insert_into).await;
drop(timer);
let elapsed = instant.elapsed();
if let Ok(res1) = &res {
debug!(
"Flow {flow_id} executed, result: {res1:?}, elapsed: {:?}",
elapsed
);
} else if let Err(res) = &res {
warn!(
"Failed to execute Flow {flow_id} on frontend {}, result: {res:?}, elapsed: {:?} with query: {}",
peer_addr, elapsed, &insert_into
);
}
// record slow query
if elapsed >= SLOW_QUERY_THRESHOLD {
warn!(
"Flow {flow_id} on frontend {} executed for {:?} before complete, query: {}",
peer_addr, elapsed, &insert_into
);
METRIC_FLOW_RULE_ENGINE_SLOW_QUERY
.with_label_values(&[flow_id.to_string().as_str(), &insert_into, &peer_addr])
.observe(elapsed.as_secs_f64());
}
self.state
.write()
.await
.after_query_exec(elapsed, res.is_ok());
// drop the result to free client-related resources
drop(res);
let sleep_until = {
let mut state = self.state.write().await;
match state.shutdown_rx.try_recv() {
Ok(()) => break Ok(()),
Err(TryRecvError::Closed) => {
warn!("Unexpected shutdown flow {flow_id}, shutdown anyway");
break Ok(());
}
Err(TryRecvError::Empty) => (),
}
state.get_next_start_query_time(None)
};
tokio::time::sleep_until(sleep_until).await;
}
}
/// Merge the dirty time windows and use at most the first `MAX_FILTER_NUM` of them in the query
async fn gen_query_with_time_window(
&self,
engine: QueryEngineRef,
) -> Result<Option<String>, Error> {
let query_ctx = self.state.read().await.query_ctx.clone();
let start = SystemTime::now();
let since_the_epoch = start
.duration_since(UNIX_EPOCH)
.expect("Time went backwards");
let low_bound = self
.expire_after
.map(|e| since_the_epoch.as_secs() - e as u64)
.unwrap_or(u64::MIN);
let low_bound = Timestamp::new_second(low_bound as i64);
// TODO(discord9): use time window expr to get the precise expire lower bound
let expire_time_window_bound = self
.time_window_expr
.as_ref()
.map(|expr| expr.eval(low_bound))
.transpose()?;
let new_sql = {
let expr = {
match expire_time_window_bound {
Some((Some(l), Some(u))) => {
let window_size = u.sub(&l).with_context(|| UnexpectedSnafu {
reason: format!("Can't get window size from {u:?} - {l:?}"),
})?;
let col_name = self
.time_window_expr
.as_ref()
.map(|expr| expr.column_name.clone())
.with_context(|| UnexpectedSnafu {
reason: format!(
"Flow id={:?}, Failed to get column name from time window expr",
self.flow_id
),
})?;
self.state
.write()
.await
.dirty_time_windows
.gen_filter_exprs(&col_name, Some(l), window_size, self)?
}
_ => {
debug!(
"Flow id = {:?}, can't get window size: precise_lower_bound={expire_time_window_bound:?}, using the same query", self.flow_id
);
// since no time window lower/upper bound is found, just return the original query
return Ok(Some(self.query.clone()));
}
}
};
debug!(
"Flow id={:?}, Generated filter expr: {:?}",
self.flow_id,
expr.as_ref()
.map(|expr| expr_to_sql(expr).with_context(|_| DatafusionSnafu {
context: format!("Failed to generate filter expr from {expr:?}"),
}))
.transpose()?
.map(|s| s.to_string())
);
let Some(expr) = expr else {
// no new data, hence no need to update
debug!("Flow id={:?}, no new data, not update", self.flow_id);
return Ok(None);
};
let mut add_filter = AddFilterRewriter::new(expr);
// build an unoptimized plan so the unparsed SQL is clearer
let plan =
sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.query, false).await?;
let plan = plan
.clone()
.rewrite(&mut add_filter)
.with_context(|_| DatafusionSnafu {
context: format!("Failed to rewrite plan {plan:?}"),
})?
.data;
df_plan_to_sql(&plan)?
};
Ok(Some(new_sql))
}
}
#[derive(Debug)]
pub struct RecordingRuleState {
query_ctx: QueryContextRef,
/// completion time of the last query
last_update_time: Instant,
/// duration of the last query
last_query_duration: Duration,
/// Dirty time windows that need to be updated,
/// stored as a non-overlapping mapping of `start -> end`
dirty_time_windows: DirtyTimeWindows,
exec_state: ExecState,
shutdown_rx: oneshot::Receiver<()>,
}
#[derive(Debug, Clone, Default)]
pub struct DirtyTimeWindows {
windows: BTreeMap<Timestamp, Option<Timestamp>>,
}
fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
let value = Value::from(value);
let value = value
.try_to_scalar_value(&value.data_type())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert to scalar value: {}", value),
})?;
Ok(value)
}
impl DirtyTimeWindows {
/// Time window merge distance
const MERGE_DIST: i32 = 3;
/// Maximum number of filters allowed in a single query
const MAX_FILTER_NUM: usize = 20;
/// Add lower bounds to the dirty time windows. Upper bounds are ignored.
///
/// # Arguments
///
/// * `lower_bounds` - An iterator of lower bounds to be added.
pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
for lower_bound in lower_bounds {
let entry = self.windows.entry(lower_bound);
entry.or_insert(None);
}
}
/// Generate filter expressions, consuming up to `MAX_FILTER_NUM` of the dirty time windows
pub fn gen_filter_exprs(
&mut self,
col_name: &str,
expire_lower_bound: Option<Timestamp>,
window_size: chrono::Duration,
task_ctx: &RecordingRuleTask,
) -> Result<Option<datafusion_expr::Expr>, Error> {
debug!(
"expire_lower_bound: {:?}, window_size: {:?}",
expire_lower_bound.map(|t| t.to_iso8601_string()),
window_size
);
self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
if self.windows.len() > Self::MAX_FILTER_NUM {
let first_time_window = self.windows.first_key_value();
let last_time_window = self.windows.last_key_value();
warn!(
"Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
task_ctx.flow_id,
self.windows.len(),
Self::MAX_FILTER_NUM,
task_ctx.time_window_expr,
task_ctx.expire_after,
first_time_window,
last_time_window,
task_ctx.query
);
}
// get the first `MAX_FILTER_NUM` time windows
let nth = self
.windows
.iter()
.nth(Self::MAX_FILTER_NUM)
.map(|(key, _)| *key);
let first_nth = {
if let Some(nth) = nth {
let mut after = self.windows.split_off(&nth);
std::mem::swap(&mut self.windows, &mut after);
after
} else {
std::mem::take(&mut self.windows)
}
};
let mut expr_lst = vec![];
for (start, end) in first_nth.into_iter() {
debug!(
"Time window start: {:?}, end: {:?}",
start.to_iso8601_string(),
end.map(|t| t.to_iso8601_string())
);
use datafusion_expr::{col, lit};
let lower = to_df_literal(start)?;
let upper = end.map(to_df_literal).transpose()?;
let expr = if let Some(upper) = upper {
col(col_name)
.gt_eq(lit(lower))
.and(col(col_name).lt(lit(upper)))
} else {
col(col_name).gt_eq(lit(lower))
};
expr_lst.push(expr);
}
let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
Ok(expr)
}
/// Merge time windows that overlap or are within `MERGE_DIST` window sizes of each other
pub fn merge_dirty_time_windows(
&mut self,
window_size: chrono::Duration,
expire_lower_bound: Option<Timestamp>,
) -> Result<(), Error> {
let mut new_windows = BTreeMap::new();
let mut prev_tw = None;
for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
// filter out expired time window
if let Some(expire_lower_bound) = expire_lower_bound {
if lower_bound <= expire_lower_bound {
continue;
}
}
let Some(prev_tw) = &mut prev_tw else {
prev_tw = Some((lower_bound, upper_bound));
continue;
};
let std_window_size = window_size.to_std().map_err(|e| {
InternalSnafu {
reason: e.to_string(),
}
.build()
})?;
// if cur.lower - prev.upper <= window_size * MERGE_DIST, merge
let prev_upper = prev_tw
.1
.unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
prev_tw.1 = Some(prev_upper);
let cur_upper = upper_bound.unwrap_or(
lower_bound
.add_duration(std_window_size)
.context(TimeSnafu)?,
);
if lower_bound
.sub(&prev_upper)
.map(|dist| dist <= window_size * Self::MERGE_DIST)
.unwrap_or(false)
{
prev_tw.1 = Some(cur_upper);
} else {
new_windows.insert(prev_tw.0, prev_tw.1);
*prev_tw = (lower_bound, Some(cur_upper));
}
}
if let Some(prev_tw) = prev_tw {
new_windows.insert(prev_tw.0, prev_tw.1);
}
self.windows = new_windows;
Ok(())
}
}
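// Illustrative sketch (not part of the change above): how a set of (start, end)
// windows becomes a single OR-combined filter, mirroring what `gen_filter_exprs`
// builds per column. Plain i64 literals stand in for timestamp scalars here.
#[allow(dead_code)]
fn or_combined_filter_sketch(
    col_name: &str,
    windows: &[(i64, i64)],
) -> Option<datafusion_expr::Expr> {
    use datafusion_expr::{col, lit};
    windows
        .iter()
        .map(|(start, end)| {
            // each window contributes a half-open range filter: start <= col < end
            col(col_name)
                .gt_eq(lit(*start))
                .and(col(col_name).lt(lit(*end)))
        })
        // the per-window filters are OR-ed together into one predicate
        .reduce(|a, b| a.or(b))
}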
impl RecordingRuleState {
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
Self {
query_ctx,
last_update_time: Instant::now(),
last_query_duration: Duration::from_secs(0),
dirty_time_windows: Default::default(),
exec_state: ExecState::Idle,
shutdown_rx,
}
}
/// called after the last query is done
/// `is_succ` indicates whether the last query was successful
pub fn after_query_exec(&mut self, elapsed: Duration, _is_succ: bool) {
self.exec_state = ExecState::Idle;
self.last_query_duration = elapsed;
self.last_update_time = Instant::now();
}
/// wait for roughly `last_query_duration` (capped by `max_timeout` if given, and never less than `MIN_REFRESH_DURATION`) before starting the next query
pub fn get_next_start_query_time(&self, max_timeout: Option<Duration>) -> Instant {
let next_duration = max_timeout
.unwrap_or(self.last_query_duration)
.min(self.last_query_duration);
let next_duration = next_duration.max(MIN_REFRESH_DURATION);
self.last_update_time + next_duration
}
}
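// Illustrative sketch (not part of the change above): the clamping performed by
// `get_next_start_query_time`, on plain `Duration`s. The sleep before the next
// query tracks the last query's duration, is capped by `max_timeout` when given,
// and is never shorter than `MIN_REFRESH_DURATION`.
#[allow(dead_code)]
fn next_sleep_sketch(last_query_duration: Duration, max_timeout: Option<Duration>) -> Duration {
    max_timeout
        .unwrap_or(last_query_duration)
        .min(last_query_duration)
        .max(MIN_REFRESH_DURATION)
}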
#[derive(Debug, Clone)]
enum ExecState {
Idle,
Executing,
}
#[cfg(test)]
mod test {
use pretty_assertions::assert_eq;
use super::*;
#[test]
fn test_merge_dirty_time_windows() {
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// just enough to merge
assert_eq!(
dirty.windows,
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)])
);
// separate time window
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// too far apart to merge, two separate windows remain
assert_eq!(
BTreeMap::from([
(
Timestamp::new_second(0),
Some(Timestamp::new_second(5 * 60))
),
(
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
Some(Timestamp::new_second(
(3 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)
]),
dirty.windows
);
// overlapping
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// within merge distance, merged into one window
assert_eq!(
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
),]),
dirty.windows
);
// expired
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(
chrono::Duration::seconds(5 * 60),
Some(Timestamp::new_second(
(DirtyTimeWindows::MERGE_DIST as i64) * 6 * 60,
)),
)
.unwrap();
// all windows expired (lower bounds <= expire bound), nothing remains
assert_eq!(BTreeMap::from([]), dirty.windows);
}
}

View File

@@ -0,0 +1,163 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Frontend client to run a flow as a recording rule, i.e. a time-window-aware normal query triggered on every tick set by the user
use std::sync::Arc;
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
use common_meta::peer::Peer;
use common_meta::rpc::store::RangeRequest;
use meta_client::client::MetaClient;
use snafu::ResultExt;
use crate::error::{ExternalSnafu, UnexpectedSnafu};
use crate::recording_rules::engine::DEFAULT_RULE_ENGINE_QUERY_TIMEOUT;
use crate::Error;
fn default_channel_mgr() -> ChannelManager {
let cfg = ChannelConfig::new().timeout(DEFAULT_RULE_ENGINE_QUERY_TIMEOUT);
ChannelManager::with_config(cfg)
}
fn client_from_urls(addrs: Vec<String>) -> Client {
Client::with_manager_and_urls(default_channel_mgr(), addrs)
}
/// A simple frontend client able to execute sql using grpc protocol
#[derive(Debug)]
pub enum FrontendClient {
Distributed {
meta_client: Arc<MetaClient>,
channel_mgr: ChannelManager,
},
Standalone {
/// for the sake of simplicity, grpc is still used even in standalone mode
/// note that the client must be lazy, so the connection is only made after the frontend has booted
/// TODO(discord9): not use grpc under standalone mode
database_client: DatabaseWithPeer,
},
}
#[derive(Debug, Clone)]
pub struct DatabaseWithPeer {
pub database: Database,
pub peer: Peer,
}
impl DatabaseWithPeer {
fn new(database: Database, peer: Peer) -> Self {
Self { database, peer }
}
}
impl FrontendClient {
pub fn from_meta_client(meta_client: Arc<MetaClient>) -> Self {
Self::Distributed {
meta_client,
channel_mgr: default_channel_mgr(),
}
}
pub fn from_static_grpc_addr(addr: String) -> Self {
let peer = Peer {
id: 0,
addr: addr.clone(),
};
let mgr = default_channel_mgr();
let client = Client::with_manager_and_urls(mgr.clone(), vec![addr]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Self::Standalone {
database_client: DatabaseWithPeer::new(database, peer),
}
}
}
impl FrontendClient {
async fn scan_for_frontend(&self) -> Result<Vec<(NodeInfoKey, NodeInfo)>, Error> {
let Self::Distributed { meta_client, .. } = self else {
return Ok(vec![]);
};
let cluster_client = meta_client
.cluster_client()
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend);
let req = RangeRequest::new().with_prefix(prefix);
let resp = cluster_client
.range(req)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let mut res = Vec::with_capacity(resp.kvs.len());
for kv in resp.kvs {
let key = NodeInfoKey::try_from(kv.key)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let val = NodeInfo::try_from(kv.value)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
res.push((key, val));
}
Ok(res)
}
/// Get a database client for the frontend with the largest `last_activity_ts`
async fn get_last_active_frontend(&self) -> Result<DatabaseWithPeer, Error> {
match &self {
Self::Standalone { database_client } => Ok(database_client.clone()),
Self::Distributed {
meta_client: _,
channel_mgr,
} => {
let frontends = self.scan_for_frontend().await?;
let mut last_activity_ts = i64::MIN;
let mut peer = None;
for (_key, val) in frontends.iter() {
if val.last_activity_ts > last_activity_ts {
last_activity_ts = val.last_activity_ts;
peer = Some(val.peer.clone());
}
}
let Some(peer) = peer else {
UnexpectedSnafu {
reason: format!("No frontend available: {:?}", frontends),
}
.fail()?
};
let client =
Client::with_manager_and_urls(channel_mgr.clone(), vec![peer.addr.clone()]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Ok(DatabaseWithPeer::new(database, peer))
}
}
}
/// Get a database client, and possibly update it before returning.
pub async fn get_database_client(&self) -> Result<DatabaseWithPeer, Error> {
match self {
Self::Standalone { database_client } => Ok(database_client.clone()),
Self::Distributed { meta_client: _, .. } => self.get_last_active_frontend().await,
}
}
}
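// Illustrative usage sketch (not part of the change above): building a standalone
// client from a fixed gRPC address and fetching a database handle. The address
// below is a placeholder for this example.
#[allow(dead_code)]
async fn standalone_client_sketch() -> Result<(), Error> {
    let client = FrontendClient::from_static_grpc_addr("127.0.0.1:4001".to_string());
    let db = client.get_database_client().await?;
    // `db.database` can now be used to run SQL against the frontend,
    // while `db.peer` records which frontend was picked.
    let _peer_addr = db.peer.addr;
    Ok(())
}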

View File

@@ -57,6 +57,7 @@ use crate::error::{
};
use crate::heartbeat::HeartbeatTask;
use crate::metrics::{METRIC_FLOW_PROCESSING_TIME, METRIC_FLOW_ROWS};
use crate::recording_rules::{FrontendClient, RecordingRuleEngine};
use crate::transform::register_function_to_query_engine;
use crate::utils::{SizeReportSender, StateReportHandler};
use crate::{Error, FlowWorkerManager, FlownodeOptions};
@@ -245,6 +246,7 @@ impl FlownodeInstance {
self.server.shutdown().await.context(ShutdownServerSnafu)?;
if let Some(task) = &self.heartbeat_task {
info!("Close heartbeat task for flownode");
task.shutdown();
}
@@ -271,6 +273,8 @@ pub struct FlownodeBuilder {
heartbeat_task: Option<HeartbeatTask>,
/// receive a oneshot sender to send state size report
state_report_handler: Option<StateReportHandler>,
/// Client to send sql to frontend
frontend_client: Arc<FrontendClient>,
}
impl FlownodeBuilder {
@@ -281,6 +285,7 @@ impl FlownodeBuilder {
table_meta: TableMetadataManagerRef,
catalog_manager: CatalogManagerRef,
flow_metadata_manager: FlowMetadataManagerRef,
frontend_client: Arc<FrontendClient>,
) -> Self {
Self {
opts,
@@ -290,6 +295,7 @@ impl FlownodeBuilder {
flow_metadata_manager,
heartbeat_task: None,
state_report_handler: None,
frontend_client,
}
}
@@ -447,7 +453,14 @@ impl FlownodeBuilder {
let node_id = self.opts.node_id.map(|id| id as u32);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta);
let rule_engine = RecordingRuleEngine::new(
self.frontend_client.clone(),
query_engine.clone(),
self.flow_metadata_manager.clone(),
table_meta.clone(),
);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta, rule_engine);
for worker_id in 0..num_workers {
let (tx, rx) = oneshot::channel();

View File

@@ -86,7 +86,8 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
let schema = vec![
datatypes::schema::ColumnSchema::new("number", CDT::uint32_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
.with_time_index(true),
];
let mut columns = vec![];
let numbers = (1..=10).collect_vec();
@@ -114,6 +115,37 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
};
catalog_list.register_table_sync(req_with_ts).unwrap();
let schema = vec![
datatypes::schema::ColumnSchema::new("NUMBER", CDT::uint32_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
.with_time_index(true),
];
let mut columns = vec![];
let numbers = (1..=10).collect_vec();
let column: VectorRef = Arc::new(<u32 as Scalar>::VectorType::from_vec(numbers));
columns.push(column);
let ts = (1..=10).collect_vec();
let mut builder = TimestampMillisecondVectorBuilder::with_capacity(10);
ts.into_iter()
.map(|v| builder.push(Some(TimestampMillisecond::new(v))))
.count();
let column: VectorRef = builder.to_vector_cloned();
columns.push(column);
let schema = Arc::new(Schema::new(schema));
let recordbatch = common_recordbatch::RecordBatch::new(schema, columns).unwrap();
let table = MemTable::table("UPPERCASE_NUMBERS_WITH_TS", recordbatch);
let req_with_ts = RegisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table_name: "UPPERCASE_NUMBERS_WITH_TS".to_string(),
table_id: 1025,
table,
};
catalog_list.register_table_sync(req_with_ts).unwrap();
let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);
let engine = factory.query_engine();

View File

@@ -238,6 +238,13 @@ pub enum Error {
source: servers::error::Error,
},
#[snafu(display("Failed to create logical plan for prometheus label values query"))]
PrometheusLabelValuesQueryPlan {
#[snafu(implicit)]
location: Location,
source: query::promql::error::Error,
},
#[snafu(display("Failed to describe schema for given statement"))]
DescribeStatement {
#[snafu(implicit)]
@@ -366,6 +373,8 @@ impl ErrorExt for Error {
| Error::PrometheusMetricNamesQueryPlan { source, .. }
| Error::ExecutePromql { source, .. } => source.status_code(),
Error::PrometheusLabelValuesQueryPlan { source, .. } => source.status_code(),
Error::CollectRecordbatch { .. } => StatusCode::EngineExecuteQuery,
Error::SqlExecIntercepted { source, .. } => source.status_code(),

View File

@@ -26,6 +26,7 @@ mod region_query;
pub mod standalone;
use std::sync::Arc;
use std::time::SystemTime;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
@@ -471,6 +472,21 @@ impl PrometheusHandler for Instance {
.context(ExecuteQuerySnafu)
}
async fn query_label_values(
&self,
metric: String,
label_name: String,
matchers: Vec<Matcher>,
start: SystemTime,
end: SystemTime,
ctx: &QueryContextRef,
) -> server_error::Result<Vec<String>> {
self.handle_query_label_values(metric, label_name, matchers, start, end, ctx)
.await
.map_err(BoxedError::new)
.context(ExecuteQuerySnafu)
}
fn catalog_manager(&self) -> CatalogManagerRef {
self.catalog_manager.clone()
}

View File

@@ -35,11 +35,10 @@ use servers::error::{
CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, Result as ServerResult,
TableNotFoundSnafu,
};
use servers::http::jaeger::QueryTraceParams;
use servers::http::jaeger::{QueryTraceParams, FIND_TRACES_COLS};
use servers::otlp::trace::{
DURATION_NANO_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_ID_COLUMN,
SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, TIMESTAMP_COLUMN, TRACE_ID_COLUMN,
TRACE_TABLE_NAME,
DURATION_NANO_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_KIND_COLUMN,
SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, TIMESTAMP_COLUMN, TRACE_ID_COLUMN, TRACE_TABLE_NAME,
};
use servers::query_handler::JaegerQueryHandler;
use session::context::QueryContextRef;
@@ -102,16 +101,9 @@ impl JaegerQueryHandler for Instance {
}
async fn get_trace(&self, ctx: QueryContextRef, trace_id: &str) -> ServerResult<Output> {
// It's equivalent to `SELECT trace_id, timestamp, duration_nano, service_name, span_name, span_id, span_attributes FROM {db}.{trace_table} WHERE trace_id = '{trace_id}'`.
let selects = vec![
col(TRACE_ID_COLUMN),
col(TIMESTAMP_COLUMN),
col(DURATION_NANO_COLUMN),
col(SERVICE_NAME_COLUMN),
col(SPAN_NAME_COLUMN),
col(SPAN_ID_COLUMN),
col(SPAN_ATTRIBUTES_COLUMN),
];
// It's equivalent to `SELECT trace_id, timestamp, duration_nano, service_name, span_name, span_id, span_attributes, resource_attributes, parent_span_id
// FROM {db}.{trace_table} WHERE trace_id = '{trace_id}'`.
let selects: Vec<Expr> = FIND_TRACES_COLS.clone();
let filters = vec![col(TRACE_ID_COLUMN).eq(lit(trace_id))];
@@ -133,15 +125,7 @@ impl JaegerQueryHandler for Instance {
ctx: QueryContextRef,
query_params: QueryTraceParams,
) -> ServerResult<Output> {
let selects = vec![
col(TRACE_ID_COLUMN),
col(TIMESTAMP_COLUMN),
col(DURATION_NANO_COLUMN),
col(SERVICE_NAME_COLUMN),
col(SPAN_NAME_COLUMN),
col(SPAN_ID_COLUMN),
col(SPAN_ATTRIBUTES_COLUMN),
];
let selects: Vec<Expr> = FIND_TRACES_COLS.clone();
let mut filters = vec![];

View File

@@ -12,20 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::SystemTime;
use catalog::information_schema::TABLES;
use client::OutputData;
use common_catalog::consts::INFORMATION_SCHEMA_NAME;
use common_catalog::format_full_table_name;
use common_recordbatch::util;
use common_telemetry::tracing;
use datatypes::prelude::Value;
use promql_parser::label::Matcher;
use promql_parser::label::{Matcher, Matchers};
use query::promql;
use query::promql::planner::PromPlanner;
use servers::prometheus;
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
use crate::error::{
CatalogSnafu, CollectRecordbatchSnafu, ExecLogicalPlanSnafu,
PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu, Result, TableNotFoundSnafu,
PrometheusLabelValuesQueryPlanSnafu, PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu,
Result, TableNotFoundSnafu,
};
use crate::instance::Instance;
@@ -96,4 +102,77 @@ impl Instance {
Ok(results)
}
/// Handles label values query request, returns the values.
#[tracing::instrument(skip_all)]
pub(crate) async fn handle_query_label_values(
&self,
metric: String,
label_name: String,
matchers: Vec<Matcher>,
start: SystemTime,
end: SystemTime,
ctx: &QueryContextRef,
) -> Result<Vec<String>> {
let table_schema = ctx.current_schema();
let table = self
.catalog_manager
.table(ctx.current_catalog(), &table_schema, &metric, Some(ctx))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;
let dataframe = self
.query_engine
.read_table(table.clone())
.with_context(|_| ReadTableSnafu {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;
let scan_plan = dataframe.into_logical_plan();
let filter_conditions =
PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
.context(PrometheusLabelValuesQueryPlanSnafu)?;
let logical_plan = promql::label_values::rewrite_label_values_query(
table,
scan_plan,
filter_conditions,
label_name,
start,
end,
)
.context(PrometheusLabelValuesQueryPlanSnafu)?;
let results = self
.query_engine
.execute(logical_plan, ctx.clone())
.await
.context(ExecLogicalPlanSnafu)?;
let batches = match results.data {
OutputData::Stream(stream) => util::collect(stream)
.await
.context(CollectRecordbatchSnafu)?,
OutputData::RecordBatches(rbs) => rbs.take(),
_ => unreachable!("should not happen"),
};
let mut results = Vec::with_capacity(batches.iter().map(|b| b.num_rows()).sum());
for batch in batches {
// Only one column in the results, ensured by the label values plan built above.
let names = batch.column(0);
for i in 0..names.len() {
let Value::String(name) = names.get(i) else {
unreachable!();
};
results.push(name.into_string());
}
}
Ok(results)
}
}

View File

@@ -29,6 +29,7 @@ prost.workspace = true
puffin.workspace = true
regex.workspace = true
regex-automata.workspace = true
roaring = "0.10"
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true

src/index/src/bitmap.rs Normal file
View File

@@ -0,0 +1,868 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io;
use std::ops::RangeInclusive;
use common_base::BitVec;
/// `BitmapType` enumerates how bitmaps are encoded within the inverted index.
pub use greptime_proto::v1::index::BitmapType;
use roaring::RoaringBitmap;
/// A bitmap representation supporting both BitVec and RoaringBitmap formats.
///
/// This enum provides unified bitmap operations while allowing efficient storage
/// in different formats. The implementation automatically handles type conversions
/// when performing operations between different formats.
///
/// # Examples
///
/// Creating a new Roaring bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::new_roaring();
/// assert!(bitmap.is_empty());
/// ```
///
/// Creating a full BitVec bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::full_bitvec(10);
/// assert_eq!(bitmap.count_ones(), 10);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum Bitmap {
Roaring(RoaringBitmap),
BitVec(BitVec),
}
impl Bitmap {
/// Creates a new empty BitVec-based bitmap.
pub fn new_bitvec() -> Self {
Bitmap::BitVec(BitVec::EMPTY)
}
/// Creates a new empty RoaringBitmap-based bitmap.
pub fn new_roaring() -> Self {
Bitmap::Roaring(RoaringBitmap::new())
}
/// Creates a full BitVec-based bitmap with all bits set to 1.
///
/// # Arguments
/// * `size` - The number of bits to allocate and set
pub fn full_bitvec(size: usize) -> Self {
Bitmap::BitVec(BitVec::repeat(true, size))
}
/// Creates a full RoaringBitmap-based bitmap with bits 0..size set to 1.
///
/// # Arguments
/// * `size` - The exclusive upper bound for the bit range
pub fn full_roaring(size: usize) -> Self {
let mut roaring = RoaringBitmap::new();
roaring.insert_range(0..size as u32);
Bitmap::Roaring(roaring)
}
/// Returns the number of bits set to 1 in the bitmap.
pub fn count_ones(&self) -> usize {
match self {
Bitmap::BitVec(bitvec) => bitvec.count_ones(),
Bitmap::Roaring(roaring) => roaring.len() as _,
}
}
/// Checks if the bitmap contains no set bits.
pub fn is_empty(&self) -> bool {
match self {
Bitmap::BitVec(bitvec) => bitvec.is_empty(),
Bitmap::Roaring(roaring) => roaring.is_empty(),
}
}
/// Inserts a range of bits into the bitmap.
///
/// # Arguments
/// * `range` - Inclusive range of bits to set
pub fn insert_range(&mut self, range: RangeInclusive<usize>) {
match self {
Bitmap::BitVec(bitvec) => {
if *range.end() >= bitvec.len() {
bitvec.resize(range.end() + 1, false);
}
for i in range {
bitvec.set(i, true);
}
}
Bitmap::Roaring(roaring) => {
let range = *range.start() as u32..=*range.end() as u32;
roaring.insert_range(range);
}
}
}
/// Serializes the bitmap into a byte buffer using the specified format.
///
/// # Arguments
/// * `serialize_type` - Target format for serialization
/// * `writer` - Output writer to write the serialized data
pub fn serialize_into(
&self,
serialize_type: BitmapType,
mut writer: impl io::Write,
) -> io::Result<()> {
match (self, serialize_type) {
(Bitmap::BitVec(bitvec), BitmapType::BitVec) => {
writer.write_all(bitvec.as_raw_slice())?;
}
(Bitmap::Roaring(roaring), BitmapType::Roaring) => {
roaring.serialize_into(writer)?;
}
(Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
bitmap.serialize_into(writer)?;
}
(Bitmap::Roaring(roaring), BitmapType::BitVec) => {
let bitvec = Bitmap::roaring_to_bitvec(roaring);
writer.write_all(bitvec.as_raw_slice())?;
}
}
Ok(())
}
/// Computes the size of the serialized bitmap in bytes.
///
/// # Arguments
/// * `bitmap_type` - Format of data to be serialized
pub fn serialized_size(&self, bitmap_type: BitmapType) -> usize {
match (self, bitmap_type) {
(Bitmap::BitVec(bitvec), BitmapType::BitVec) => bitvec.as_raw_slice().len(),
(Bitmap::Roaring(roaring), BitmapType::Roaring) => roaring.serialized_size(),
(Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
bitmap.serialized_size()
}
(Bitmap::Roaring(roaring), BitmapType::BitVec) => {
let bitvec = Bitmap::roaring_to_bitvec(roaring);
bitvec.as_raw_slice().len()
}
}
}
/// Deserializes a bitmap from a byte buffer.
///
/// # Arguments
/// * `buf` - Input buffer containing serialized data
/// * `bitmap_type` - Format of the serialized data
pub fn deserialize_from(buf: &[u8], bitmap_type: BitmapType) -> std::io::Result<Self> {
match bitmap_type {
BitmapType::BitVec => {
let bitvec = BitVec::from_slice(buf);
Ok(Bitmap::BitVec(bitvec))
}
BitmapType::Roaring => {
let roaring = RoaringBitmap::deserialize_from(buf)?;
Ok(Bitmap::Roaring(roaring))
}
}
}
/// Computes the union with another bitmap (in-place).
///
/// If the other bitmap is a different type, it will be converted to match
/// the current bitmap's type.
pub fn union(&mut self, other: Self) {
if self.is_empty() {
*self = other;
return;
}
match (self, other) {
(Bitmap::BitVec(bitvec1), bitmap) => {
let bitvec2 = bitmap.into_bitvec();
if bitvec1.len() > bitvec2.len() {
*bitvec1 |= bitvec2
} else {
*bitvec1 = bitvec2 | &*bitvec1;
}
}
(Bitmap::Roaring(roaring1), bitmap) => {
let roaring2 = bitmap.into_roaring();
*roaring1 |= roaring2;
}
}
}
/// Computes the intersection with another bitmap (in-place).
///
/// If the other bitmap is a different type, it will be converted to match
/// the current bitmap's type.
pub fn intersect(&mut self, other: Self) {
match (self, other) {
(Bitmap::BitVec(bitvec1), bitmap) => {
let mut bitvec2 = bitmap.into_bitvec();
let len = (bitvec1.len() - bitvec1.trailing_zeros())
.min(bitvec2.len() - bitvec2.trailing_zeros());
bitvec1.truncate(len);
bitvec2.truncate(len);
*bitvec1 &= bitvec2;
}
(Bitmap::Roaring(roaring1), bitmap) => {
let roaring2 = bitmap.into_roaring();
*roaring1 &= roaring2;
}
}
}
/// Returns an iterator over the indices of set bits.
pub fn iter_ones(&self) -> Box<dyn Iterator<Item = usize> + '_> {
match self {
Bitmap::BitVec(bitvec) => Box::new(bitvec.iter_ones()),
Bitmap::Roaring(roaring) => Box::new(roaring.iter().map(|x| x as usize)),
}
}
/// Creates a bitmap from bytes in LSB0 (least significant bit first) order.
///
/// # Arguments
/// * `bytes` - Input bytes in LSB0 order
/// * `bitmap_type` - Type of bitmap to create
pub fn from_lsb0_bytes(bytes: &[u8], bitmap_type: BitmapType) -> Self {
match bitmap_type {
BitmapType::BitVec => {
let bitvec = BitVec::from_slice(bytes);
Bitmap::BitVec(bitvec)
}
BitmapType::Roaring => {
let roaring = RoaringBitmap::from_lsb0_bytes(0, bytes);
Bitmap::Roaring(roaring)
}
}
}
/// Computes memory usage of the bitmap in bytes.
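///
/// Illustrative sketch, e.g. for accounting buffered bitmaps against a
/// memory threshold:
///
/// ```ignore
/// let bitmap = Bitmap::full_roaring(1024);
/// // Derived from the Roaring container statistics (or from the BitVec
/// // storage size for the BitVec variant).
/// assert!(bitmap.memory_usage() > 0);
/// ```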
pub fn memory_usage(&self) -> usize {
match self {
// `BitVec::capacity()` is measured in bits; convert to bytes.
Bitmap::BitVec(bitvec) => bitvec.capacity() / 8,
Bitmap::Roaring(roaring) => {
let stat = roaring.statistics();
(stat.n_bytes_array_containers
+ stat.n_bytes_bitset_containers
+ stat.n_bytes_run_containers) as usize
}
}
}
fn into_bitvec(self) -> BitVec {
match self {
Bitmap::BitVec(bitvec) => bitvec,
Bitmap::Roaring(roaring) => Self::roaring_to_bitvec(&roaring),
}
}
fn into_roaring(self) -> RoaringBitmap {
match self {
Bitmap::Roaring(roaring) => roaring,
Bitmap::BitVec(bitvec) => Self::bitvec_to_roaring(bitvec),
}
}
fn roaring_to_bitvec(roaring: &RoaringBitmap) -> BitVec {
let max_value = roaring.max().unwrap_or(0);
let mut bitvec = BitVec::repeat(false, max_value as usize + 1);
for i in roaring {
bitvec.set(i as usize, true);
}
bitvec
}
fn bitvec_to_roaring(mut bitvec: BitVec) -> RoaringBitmap {
bitvec.resize(bitvec.capacity(), false);
RoaringBitmap::from_lsb0_bytes(0, bitvec.as_raw_slice())
}
}
impl Default for Bitmap {
fn default() -> Self {
Bitmap::new_roaring()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_full_bitmaps() {
let bv = Bitmap::full_bitvec(10);
assert_eq!(bv.count_ones(), 10);
let rb = Bitmap::full_roaring(10);
assert_eq!(rb.count_ones(), 10);
}
#[test]
fn test_serialization_roundtrip() {
let original = Bitmap::full_roaring(100);
let mut buf = Vec::new();
// Serialize as Roaring
original
.serialize_into(BitmapType::Roaring, &mut buf)
.unwrap();
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::Roaring).unwrap();
assert_eq!(original, deserialized);
// Serialize as BitVec
buf.clear();
original
.serialize_into(BitmapType::BitVec, &mut buf)
.unwrap();
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::BitVec).unwrap();
assert_eq!(original.count_ones(), deserialized.count_ones());
}
#[test]
fn test_union_fulls() {
// Test BitVec union
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
bv1.union(bv2);
assert_eq!(bv1.count_ones(), 5);
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
bv1.union(bv2);
assert_eq!(bv1.count_ones(), 5);
// Test Roaring union
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
rb1.union(rb2);
assert_eq!(rb1.count_ones(), 5);
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
rb1.union(rb2);
assert_eq!(rb1.count_ones(), 5);
// Test cross-type union
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
let bv = Bitmap::full_bitvec(3); // 0-2: 111
rb.union(bv);
assert_eq!(rb.count_ones(), 5);
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
let rb = Bitmap::full_roaring(3); // 0-2: 111
bv.union(rb);
assert_eq!(bv.count_ones(), 5);
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
rb.union(bv);
assert_eq!(rb.count_ones(), 5);
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
let rb = Bitmap::full_roaring(5); // 0-4: 11111
bv.union(rb);
assert_eq!(bv.count_ones(), 5);
}
#[test]
fn test_union_bitvec() {
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
);
// Test different lengths
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
);
// Test empty bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
// Test empty and full bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::full_bitvec(8);
bv1.union(bv2);
assert_eq!(bv1, Bitmap::full_bitvec(8));
let mut bv1 = Bitmap::full_bitvec(8);
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert_eq!(bv1, Bitmap::full_bitvec(8));
}
#[test]
fn test_union_roaring() {
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
// Test different lengths
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
);
// Test empty bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
// Test empty and full bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::full_roaring(8);
rb1.union(rb2);
assert_eq!(rb1, Bitmap::full_roaring(8));
let mut rb1 = Bitmap::full_roaring(8);
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert_eq!(rb1, Bitmap::full_roaring(8));
}
#[test]
fn test_union_mixed() {
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::full_bitvec(8);
rb.union(bv);
assert_eq!(rb, Bitmap::full_roaring(8));
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.union(rb);
assert_eq!(bv, Bitmap::full_bitvec(8));
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::full_bitvec(8);
rb.union(bv);
assert_eq!(rb, Bitmap::full_bitvec(8));
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::new_roaring();
bv.union(rb);
assert_eq!(bv, Bitmap::full_bitvec(8));
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::new_bitvec();
rb.union(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::new_roaring();
bv.union(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
let rb = Bitmap::new_roaring();
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
let bv = Bitmap::new_bitvec();
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
}
#[test]
fn test_intersect_fulls() {
// Test BitVec intersect
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
bv1.intersect(bv2);
assert_eq!(bv1.count_ones(), 3);
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
bv1.intersect(bv2);
assert_eq!(bv1.count_ones(), 3);
// Test Roaring intersect
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
rb1.intersect(rb2);
assert_eq!(rb1.count_ones(), 3);
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
rb1.intersect(rb2);
assert_eq!(rb1.count_ones(), 3);
// Test cross-type intersect
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
let bv = Bitmap::full_bitvec(3); // 0-2: 111
rb.intersect(bv);
assert_eq!(rb.count_ones(), 3);
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
let rb = Bitmap::full_roaring(3); // 0-2: 111
bv.intersect(rb);
assert_eq!(bv.count_ones(), 3);
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
rb.intersect(bv);
assert_eq!(rb.count_ones(), 3);
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
let rb = Bitmap::full_roaring(5); // 0-4: 11111
bv.intersect(rb);
assert_eq!(bv.count_ones(), 3);
}
#[test]
fn test_intersect_bitvec() {
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
// Test different lengths
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
// Test empty bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
// Test empty and full bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::full_bitvec(8);
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::full_bitvec(8);
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
}
#[test]
fn test_intersect_roaring() {
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
// Test different lengths
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
// Test empty bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
// Test empty and full bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::full_roaring(8);
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::full_roaring(8);
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
}
#[test]
fn test_intersect_mixed() {
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::full_bitvec(8);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring)
);
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::full_bitvec(8);
rb.intersect(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::new_roaring();
bv.intersect(rb);
assert!(bv.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::full_roaring(8);
bv.intersect(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::full_roaring(8);
let bv = Bitmap::new_bitvec();
rb.intersect(bv);
assert!(rb.is_empty());
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let rb = Bitmap::new_roaring();
bv.intersect(rb);
assert!(bv.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::new_bitvec();
rb.intersect(bv);
assert!(rb.is_empty());
}
#[test]
fn test_insert_range() {
let mut bv = Bitmap::new_bitvec();
bv.insert_range(0..=5);
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
let mut rb = Bitmap::new_roaring();
rb.insert_range(0..=5);
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
let mut bv = Bitmap::new_bitvec();
bv.insert_range(10..=10);
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![10]);
let mut rb = Bitmap::new_roaring();
rb.insert_range(10..=10);
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![10]);
}
}

View File

@@ -17,6 +17,7 @@ pub mod sort_create;
use async_trait::async_trait;
use crate::bitmap::BitmapType;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::InvertedIndexWriter;
use crate::BytesRef;
@@ -53,5 +54,9 @@ pub trait InvertedIndexCreator: Send {
/// Finalizes the index creation process, ensuring all data is properly indexed and stored
/// in the provided writer
async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()>;
async fn finish(
&mut self,
writer: &mut dyn InvertedIndexWriter,
bitmap_type: BitmapType,
) -> Result<()>;
}

View File

@@ -17,22 +17,23 @@ mod intermediate_rw;
mod merge_stream;
use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::ValueStream;
use crate::{Bytes, BytesRef};
/// A stream of sorted values along with their associated bitmap
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
/// Output of a sorting operation, encapsulating a bitmap for null values and a stream of sorted items
pub struct SortOutput {
/// Bitmap indicating which segments have null values
pub segment_null_bitmap: BitVec,
pub segment_null_bitmap: Bitmap,
/// Stream of sorted items
pub sorted_stream: SortedStream,
pub sorted_stream: ValueStream,
/// Total number of rows in the sorted data
pub total_row_count: usize,

View File

@@ -20,11 +20,11 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use async_trait::async_trait;
use common_base::BitVec;
use common_telemetry::{debug, error};
use futures::stream;
use snafu::ResultExt;
use crate::bitmap::Bitmap;
use crate::external_provider::ExternalTempFileProvider;
use crate::inverted_index::create::sort::intermediate_rw::{
IntermediateReader, IntermediateWriter,
@@ -45,18 +45,10 @@ pub struct ExternalSorter {
temp_file_provider: Arc<dyn ExternalTempFileProvider>,
/// Bitmap indicating which segments have null values
segment_null_bitmap: BitVec,
segment_null_bitmap: Bitmap,
/// In-memory buffer to hold values and their corresponding bitmaps until memory threshold is exceeded
values_buffer: BTreeMap<Bytes, BitVec>,
/// Count of rows in the last dumped buffer, used to streamline memory usage of `values_buffer`.
///
/// After data is dumped to external files, `last_dump_row_count` is updated to reflect the new starting point
/// for `BitVec` indexing. This means each `BitVec` in `values_buffer` thereafter encodes positions relative to
/// this count, not from 0. This mechanism effectively shrinks the memory footprint of each `BitVec`, helping manage
/// memory use more efficiently by focusing only on newly ingested data post-dump.
last_dump_row_count: usize,
values_buffer: BTreeMap<Bytes, (Bitmap, usize)>,
/// Count of all rows ingested so far
total_row_count: usize,
@@ -93,14 +85,14 @@ impl Sorter for ExternalSorter {
return Ok(());
}
let segment_index_range = self.segment_index_range(n, value.is_none());
let segment_index_range = self.segment_index_range(n);
self.total_row_count += n;
if let Some(value) = value {
let memory_diff = self.push_not_null(value, segment_index_range);
self.may_dump_buffer(memory_diff).await
} else {
set_bits(&mut self.segment_null_bitmap, segment_index_range);
self.segment_null_bitmap.insert_range(segment_index_range);
Ok(())
}
}
@@ -117,15 +109,10 @@ impl Sorter for ExternalSorter {
// TODO(zhongzc): k-way merge instead of 2-way merge
let mut tree_nodes: VecDeque<SortedStream> = VecDeque::with_capacity(readers.len() + 1);
let leading_zeros = self.last_dump_row_count / self.segment_row_count;
tree_nodes.push_back(Box::new(stream::iter(
mem::take(&mut self.values_buffer)
.into_iter()
.map(move |(value, mut bitmap)| {
bitmap.resize(bitmap.len() + leading_zeros, false);
bitmap.shift_right(leading_zeros);
Ok((value, bitmap))
}),
.map(|(value, (bitmap, _))| Ok((value, bitmap))),
)));
for (_, reader) in readers {
tree_nodes.push_back(IntermediateReader::new(reader).into_stream().await?);
@@ -161,11 +148,10 @@ impl ExternalSorter {
index_name,
temp_file_provider,
segment_null_bitmap: BitVec::new(),
segment_null_bitmap: Bitmap::new_bitvec(), // bitvec is more efficient for many null values
values_buffer: BTreeMap::new(),
total_row_count: 0,
last_dump_row_count: 0,
segment_row_count,
current_memory_usage: 0,
@@ -195,7 +181,7 @@ impl ExternalSorter {
}
/// Pushes the non-null values to the values buffer and sets the bits within
/// the specified range in the given BitVec to true.
/// the specified range in the given bitmap to true.
/// Returns the memory usage difference of the buffer after the operation.
fn push_not_null(
&mut self,
@@ -203,20 +189,23 @@ impl ExternalSorter {
segment_index_range: RangeInclusive<usize>,
) -> usize {
match self.values_buffer.get_mut(value) {
Some(bitmap) => {
let old_len = bitmap.as_raw_slice().len();
set_bits(bitmap, segment_index_range);
Some((bitmap, mem_usage)) => {
bitmap.insert_range(segment_index_range);
let new_usage = bitmap.memory_usage() + value.len();
let diff = new_usage - *mem_usage;
*mem_usage = new_usage;
bitmap.as_raw_slice().len() - old_len
diff
}
None => {
let mut bitmap = BitVec::default();
set_bits(&mut bitmap, segment_index_range);
let mut bitmap = Bitmap::new_roaring();
bitmap.insert_range(segment_index_range);
let mem_diff = bitmap.as_raw_slice().len() + value.len();
self.values_buffer.insert(value.to_vec(), bitmap);
let mem_usage = bitmap.memory_usage() + value.len();
self.values_buffer
.insert(value.to_vec(), (bitmap, mem_usage));
mem_diff
mem_usage
}
}
}
@@ -257,12 +246,8 @@ impl ExternalSorter {
.fetch_sub(memory_usage, Ordering::Relaxed);
self.current_memory_usage = 0;
let bitmap_leading_zeros = self.last_dump_row_count / self.segment_row_count;
self.last_dump_row_count =
self.total_row_count - self.total_row_count % self.segment_row_count; // align to segment
let entries = values.len();
IntermediateWriter::new(writer).write_all(values, bitmap_leading_zeros as _).await.inspect(|_|
IntermediateWriter::new(writer).write_all(values.into_iter().map(|(k, (b, _))| (k, b))).await.inspect(|_|
debug!("Dumped {entries} entries ({memory_usage} bytes) to intermediate file {file_id} for index {index_name}")
).inspect_err(|e|
error!(e; "Failed to dump {entries} entries to intermediate file {file_id} for index {index_name}")
@@ -271,13 +256,8 @@ impl ExternalSorter {
/// Determines the segment index range for the row index range
/// `[row_begin, row_begin + n - 1]`
fn segment_index_range(&self, n: usize, is_null: bool) -> RangeInclusive<usize> {
let row_begin = if is_null {
self.total_row_count
} else {
self.total_row_count - self.last_dump_row_count
};
fn segment_index_range(&self, n: usize) -> RangeInclusive<usize> {
let row_begin = self.total_row_count;
let start = self.segment_index(row_begin);
let end = self.segment_index(row_begin + n - 1);
start..=end
@@ -289,16 +269,6 @@ impl ExternalSorter {
}
}
/// Sets the bits within the specified range in the given `BitVec` to true
fn set_bits(bitmap: &mut BitVec, index_range: RangeInclusive<usize>) {
if *index_range.end() >= bitmap.len() {
bitmap.resize(index_range.end() + 1, false);
}
for index in index_range {
bitmap.set(index, true);
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
@@ -330,7 +300,7 @@ mod tests {
move |index_name, file_id| {
assert_eq!(index_name, "test");
let mut files = files.lock().unwrap();
let (writer, reader) = duplex(8 * 1024);
let (writer, reader) = duplex(1024 * 1024);
files.insert(file_id.to_string(), Box::new(reader.compat()));
Ok(Box::new(writer.compat_write()))
}

View File

@@ -19,29 +19,24 @@
//! The serialization format is as follows:
//!
//! ```text
//! [magic][bitmap leading zeros][item][item]...[item]
//! [4] [4] [?]
//! [magic][item][item]...[item]
//! [4] [?]
//!
//! Each [item] is structured as:
//! [value len][value][bitmap len][bitmap]
//! [8] [?] [8] [?]
//! ```
//!
//! The format starts with a 4-byte magic identifier, followed by a 4-byte
//! bitmap leading zeros count, indicating how many leading zeros are in the
//! fixed-size region of the bitmap. Following that, each item represents
//! a value and its associated bitmap, serialized with their lengths for
//! Each item represents a value and its associated bitmap, serialized with their lengths for
//! easier deserialization.
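//!
//! For example (illustrative), an item whose value is `b"a"` and whose
//! serialized bitmap occupies 16 bytes is laid out as:
//!
//! ```text
//! [0x01, 0x00, ..., 0x00]   value length = 1 (u64, little-endian)
//! [0x61]                    value bytes ("a")
//! [0x10, 0x00, ..., 0x00]   bitmap length = 16 (u64, little-endian)
//! [16 bytes]                bitmap serialized in the configured BitmapType
//! ```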
mod codec_v1;
use std::collections::BTreeMap;
use asynchronous_codec::{FramedRead, FramedWrite};
use common_base::BitVec;
use futures::{stream, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, StreamExt};
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::{
CloseSnafu, FlushSnafu, ReadSnafu, Result, UnknownIntermediateCodecMagicSnafu, WriteSnafu,
@@ -62,12 +57,13 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
/// Serializes and writes all provided values to the wrapped writer
pub async fn write_all(
mut self,
values: BTreeMap<Bytes, BitVec>,
bitmap_leading_zeros: u32,
values: impl IntoIterator<Item = (Bytes, Bitmap)>,
) -> Result<()> {
let (codec_magic, encoder) = (
codec_v1::CODEC_V1_MAGIC,
codec_v1::IntermediateItemEncoderV1,
codec_v1::IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
},
);
self.writer
@@ -75,11 +71,6 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
.await
.context(WriteSnafu)?;
self.writer
.write_all(&bitmap_leading_zeros.to_be_bytes())
.await
.context(WriteSnafu)?;
let value_stream = stream::iter(values.into_iter().map(Ok));
let frame_write = FramedWrite::new(&mut self.writer, encoder);
// `forward()` will flush and close the writer when the stream ends
@@ -112,17 +103,9 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
.context(ReadSnafu)?;
let decoder = match &magic {
codec_v1::CODEC_V1_MAGIC => {
let bitmap_leading_zeros = {
let mut buf = [0u8; 4];
self.reader.read_exact(&mut buf).await.context(ReadSnafu)?;
u32::from_be_bytes(buf)
};
codec_v1::IntermediateItemDecoderV1 {
bitmap_leading_zeros,
}
}
codec_v1::CODEC_V1_MAGIC => codec_v1::IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
},
_ => return UnknownIntermediateCodecMagicSnafu { magic }.fail(),
};
@@ -132,6 +115,7 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::io::{Seek, SeekFrom};
use futures::io::{AllowStdIo, Cursor};
@@ -140,6 +124,10 @@ mod tests {
use super::*;
use crate::inverted_index::error::Error;
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
#[tokio::test]
async fn test_intermediate_read_write_basic() {
let file_r = tempfile().unwrap();
@@ -148,12 +136,12 @@ mod tests {
let buf_w = AllowStdIo::new(file_w);
let values = BTreeMap::from_iter([
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
(Bytes::from("a"), bitmap(&[0b10101010])),
(Bytes::from("b"), bitmap(&[0b01010101])),
]);
let writer = IntermediateWriter::new(buf_w);
writer.write_all(values.clone(), 0).await.unwrap();
writer.write_all(values.clone()).await.unwrap();
// reset the handle
buf_r.seek(SeekFrom::Start(0)).unwrap();
@@ -161,48 +149,9 @@ mod tests {
let mut stream = reader.into_stream().await.unwrap();
let a = stream.next().await.unwrap().unwrap();
assert_eq!(a, (Bytes::from("a"), BitVec::from_slice(&[0b10101010])));
assert_eq!(a, (Bytes::from("a"), bitmap(&[0b10101010])));
let b = stream.next().await.unwrap().unwrap();
assert_eq!(b, (Bytes::from("b"), BitVec::from_slice(&[0b01010101])));
assert!(stream.next().await.is_none());
}
#[tokio::test]
async fn test_intermediate_read_write_with_prefix_zeros() {
let file_r = tempfile().unwrap();
let file_w = file_r.try_clone().unwrap();
let mut buf_r = AllowStdIo::new(file_r);
let buf_w = AllowStdIo::new(file_w);
let values = BTreeMap::from_iter([
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
]);
let writer = IntermediateWriter::new(buf_w);
writer.write_all(values.clone(), 8).await.unwrap();
// reset the handle
buf_r.seek(SeekFrom::Start(0)).unwrap();
let reader = IntermediateReader::new(buf_r);
let mut stream = reader.into_stream().await.unwrap();
let a = stream.next().await.unwrap().unwrap();
assert_eq!(
a,
(
Bytes::from("a"),
BitVec::from_slice(&[0b00000000, 0b10101010])
)
);
let b = stream.next().await.unwrap().unwrap();
assert_eq!(
b,
(
Bytes::from("b"),
BitVec::from_slice(&[0b00000000, 0b01010101])
)
);
assert_eq!(b, (Bytes::from("b"), bitmap(&[0b01010101])));
assert!(stream.next().await.is_none());
}
@@ -213,7 +162,7 @@ mod tests {
let values = BTreeMap::new();
let writer = IntermediateWriter::new(&mut buf);
writer.write_all(values.clone(), 0).await.unwrap();
writer.write_all(values.clone()).await.unwrap();
let reader = IntermediateReader::new(Cursor::new(buf));
let mut stream = reader.into_stream().await.unwrap();

View File

@@ -16,9 +16,10 @@ use std::io;
use asynchronous_codec::{BytesMut, Decoder, Encoder};
use bytes::{Buf, BufMut};
use common_base::BitVec;
use greptime_proto::v1::index::BitmapType;
use snafu::ResultExt;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::{CommonIoSnafu, Error, Result};
use crate::Bytes;
@@ -28,37 +29,42 @@ const U64_LENGTH: usize = std::mem::size_of::<u64>();
pub const CODEC_V1_MAGIC: &[u8; 4] = b"im01";
/// Serializes items of external sorting intermediate files.
pub struct IntermediateItemEncoderV1;
pub struct IntermediateItemEncoderV1 {
pub bitmap_type: BitmapType,
}
/// [`FramedWrite`] requires the [`Encoder`] trait to be implemented.
impl Encoder for IntermediateItemEncoderV1 {
type Item<'a> = (Bytes, BitVec);
type Item<'a> = (Bytes, Bitmap);
type Error = Error;
fn encode(&mut self, item: (Bytes, BitVec), dst: &mut BytesMut) -> Result<()> {
fn encode(&mut self, item: (Bytes, Bitmap), dst: &mut BytesMut) -> Result<()> {
let value_bytes = item.0;
let bitmap_bytes = item.1.into_vec();
let bitmap_size = item.1.serialized_size(self.bitmap_type);
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_bytes.len());
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_size);
dst.put_u64_le(value_bytes.len() as u64);
dst.extend_from_slice(&value_bytes);
dst.put_u64_le(bitmap_bytes.len() as u64);
dst.extend_from_slice(&bitmap_bytes);
dst.put_u64_le(bitmap_size as u64);
item.1
.serialize_into(self.bitmap_type, &mut dst.writer())
.context(CommonIoSnafu)?;
Ok(())
}
}
/// Deserializes items of external sorting intermediate files.
pub struct IntermediateItemDecoderV1 {
pub(crate) bitmap_leading_zeros: u32,
pub bitmap_type: BitmapType,
}
/// [`FramedRead`] requires the [`Decoder`] trait to be implemented.
impl Decoder for IntermediateItemDecoderV1 {
type Item = (Bytes, BitVec);
type Item = (Bytes, Bitmap);
type Error = Error;
/// Decodes the `src` into `(Bytes, BitVec)`. Returns `None` if
/// Decodes the `src` into `(Bytes, Bitmap)`. Returns `None` if
/// the `src` does not contain enough data for a complete item.
///
/// Only after successful decoding, the `src` is advanced. Otherwise,
@@ -92,8 +98,8 @@ impl Decoder for IntermediateItemDecoderV1 {
return Ok(None);
}
let mut bitmap = BitVec::repeat(false, self.bitmap_leading_zeros as _);
bitmap.extend_from_raw_slice(&buf[..bitmap_len]);
let bitmap = Bitmap::deserialize_from(&buf[..bitmap_len], self.bitmap_type)
.context(CommonIoSnafu)?;
let item = (value_bytes.to_vec(), bitmap);
@@ -113,25 +119,29 @@ impl From<io::Error> for Error {
#[cfg(test)]
mod tests {
use common_base::bit_vec::prelude::{bitvec, Lsb0};
use super::*;
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
#[test]
fn test_intermediate_codec_basic() {
let mut encoder = IntermediateItemEncoderV1;
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_leading_zeros: 0,
bitmap_type: BitmapType::Roaring,
};
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
let item1 = (b"world".to_vec(), BitVec::from_slice(&[0b01010101]));
let item1 = (b"world".to_vec(), bitmap(&[0b01010101]));
encoder.encode(item.clone(), &mut buf).unwrap();
encoder.encode(item1.clone(), &mut buf).unwrap();
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
@@ -142,14 +152,16 @@ mod tests {
#[test]
fn test_intermediate_codec_empty_item() {
let mut encoder = IntermediateItemEncoderV1;
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut buf = BytesMut::new();
let item = (b"".to_vec(), BitVec::from_slice(&[]));
let item = (b"".to_vec(), bitmap(&[]));
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_leading_zeros: 0,
bitmap_type: BitmapType::Roaring,
};
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
@@ -158,17 +170,19 @@ mod tests {
#[test]
fn test_intermediate_codec_partial() {
let mut encoder = IntermediateItemEncoderV1;
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
encoder.encode(item.clone(), &mut buf).unwrap();
let partial_length = U64_LENGTH + 3;
let mut partial_bytes = buf.split_to(partial_length);
let mut decoder = IntermediateItemDecoderV1 {
bitmap_leading_zeros: 0,
bitmap_type: BitmapType::Roaring,
};
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None); // not enough data
partial_bytes.extend_from_slice(&buf[..]);
@@ -176,25 +190,4 @@ mod tests {
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None);
assert!(partial_bytes.is_empty());
}
#[test]
fn test_intermediate_codec_prefix_zeros() {
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]);
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_leading_zeros: 3,
};
let decoded_item = decoder.decode(&mut buf).unwrap().unwrap();
assert_eq!(decoded_item.0, b"hello");
assert_eq!(
decoded_item.1,
bitvec![u8, Lsb0; 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0]
);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
assert!(buf.is_empty());
}
}

View File

@@ -16,10 +16,10 @@ use std::cmp::Ordering;
use std::pin::Pin;
use std::task::{Context, Poll};
use common_base::BitVec;
use futures::{ready, Stream, StreamExt};
use pin_project::pin_project;
use crate::bitmap::Bitmap;
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::Result;
use crate::Bytes;
@@ -28,10 +28,10 @@ use crate::Bytes;
#[pin_project]
pub struct MergeSortedStream {
stream1: Option<SortedStream>,
peek1: Option<(Bytes, BitVec)>,
peek1: Option<(Bytes, Bitmap)>,
stream2: Option<SortedStream>,
peek2: Option<(Bytes, BitVec)>,
peek2: Option<(Bytes, Bitmap)>,
}
impl MergeSortedStream {
@@ -49,7 +49,7 @@ impl MergeSortedStream {
}
impl Stream for MergeSortedStream {
type Item = Result<(Bytes, BitVec)>;
type Item = Result<(Bytes, Bitmap)>;
/// Polls both streams and returns the next item from the stream that has the smaller next item.
/// If both streams have the same next item, the bitmaps are unioned together.
@@ -89,77 +89,77 @@ impl Stream for MergeSortedStream {
}
/// Merges two bitmaps by bit-wise OR'ing them together, preserving all bits from both
fn merge_bitmaps(bitmap1: BitVec, bitmap2: BitVec) -> BitVec {
// make sure longer bitmap is on the left to avoid truncation
#[allow(clippy::if_same_then_else)]
if bitmap1.len() > bitmap2.len() {
bitmap1 | bitmap2
} else {
bitmap2 | bitmap1
}
fn merge_bitmaps(mut bitmap1: Bitmap, bitmap2: Bitmap) -> Bitmap {
bitmap1.union(bitmap2);
bitmap1
}
#[cfg(test)]
mod tests {
use futures::stream;
use greptime_proto::v1::index::BitmapType;
use super::*;
use crate::inverted_index::error::Error;
fn sorted_stream_from_vec(vec: Vec<(Bytes, BitVec)>) -> SortedStream {
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
fn sorted_stream_from_vec(vec: Vec<(Bytes, Bitmap)>) -> SortedStream {
Box::new(stream::iter(vec.into_iter().map(Ok::<_, Error>)))
}
#[tokio::test]
async fn test_merge_sorted_stream_non_overlapping() {
let stream1 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("orange"), BitVec::from_slice(&[0b01010101])),
(Bytes::from("apple"), bitmap(&[0b10101010])),
(Bytes::from("orange"), bitmap(&[0b01010101])),
]);
let stream2 = sorted_stream_from_vec(vec![
(Bytes::from("banana"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
(Bytes::from("banana"), bitmap(&[0b10101010])),
(Bytes::from("peach"), bitmap(&[0b01010101])),
]);
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("apple"));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
assert_eq!(item.1, bitmap(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("banana"));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
assert_eq!(item.1, bitmap(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("orange"));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
assert_eq!(item.1, bitmap(&[0b01010101]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("peach"));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert!(merged_stream.next().await.is_none());
}
#[tokio::test]
async fn test_merge_sorted_stream_overlapping() {
let stream1 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("orange"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("apple"), bitmap(&[0b10101010])),
(Bytes::from("orange"), bitmap(&[0b10101010])),
]);
let stream2 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), BitVec::from_slice(&[0b01010101])),
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
(Bytes::from("apple"), bitmap(&[0b01010101])),
(Bytes::from("peach"), bitmap(&[0b01010101])),
]);
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("apple"));
assert_eq!(item.1, BitVec::from_slice(&[0b11111111]));
assert_eq!(item.1, bitmap(&[0b11111111]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("orange"));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
assert_eq!(item.1, bitmap(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("peach"));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert!(merged_stream.next().await.is_none());
}

View File

@@ -18,6 +18,7 @@ use std::num::NonZeroUsize;
use async_trait::async_trait;
use snafu::ensure;
use crate::bitmap::BitmapType;
use crate::inverted_index::create::sort::{SortOutput, Sorter};
use crate::inverted_index::create::InvertedIndexCreator;
use crate::inverted_index::error::{InconsistentRowCountSnafu, Result};
@@ -68,7 +69,11 @@ impl InvertedIndexCreator for SortIndexCreator {
}
/// Finalizes the sorting for all indexes and writes them using the inverted index writer
async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()> {
async fn finish(
&mut self,
writer: &mut dyn InvertedIndexWriter,
bitmap_type: BitmapType,
) -> Result<()> {
let mut output_row_count = None;
for (index_name, mut sorter) in self.sorters.drain() {
let SortOutput {
@@ -88,7 +93,7 @@ impl InvertedIndexCreator for SortIndexCreator {
);
writer
.add_index(index_name, segment_null_bitmap, sorted_stream)
.add_index(index_name, segment_null_bitmap, sorted_stream, bitmap_type)
.await?;
}
@@ -117,9 +122,9 @@ mod tests {
use futures::{stream, StreamExt};
use super::*;
use crate::inverted_index::create::sort::SortedStream;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Error;
use crate::inverted_index::format::writer::MockInvertedIndexWriter;
use crate::inverted_index::format::writer::{MockInvertedIndexWriter, ValueStream};
use crate::Bytes;
#[tokio::test]
@@ -143,11 +148,10 @@ mod tests {
}
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer
.expect_add_index()
.times(3)
.returning(|name, null_bitmap, stream| {
mock_writer.expect_add_index().times(3).returning(
|name, null_bitmap, stream, bitmap_type| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
match name.as_str() {
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
@@ -155,7 +159,8 @@ mod tests {
_ => panic!("unexpected index name: {}", name),
}
Ok(())
});
},
);
mock_writer
.expect_finish()
.times(1)
@@ -165,7 +170,10 @@ mod tests {
Ok(())
});
creator.finish(&mut mock_writer).await.unwrap();
creator
.finish(&mut mock_writer, BitmapType::Roaring)
.await
.unwrap();
}
#[tokio::test]
@@ -191,8 +199,9 @@ mod tests {
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer
.expect_add_index()
.returning(|name, null_bitmap, stream| {
.returning(|name, null_bitmap, stream, bitmap_type| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
match name.as_str() {
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
@@ -203,7 +212,7 @@ mod tests {
});
mock_writer.expect_finish().never();
let res = creator.finish(&mut mock_writer).await;
let res = creator.finish(&mut mock_writer, BitmapType::Roaring).await;
assert!(matches!(res, Err(Error::InconsistentRowCount { .. })));
}
@@ -219,8 +228,9 @@ mod tests {
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer
.expect_add_index()
.returning(|name, null_bitmap, stream| {
.returning(|name, null_bitmap, stream, bitmap_type| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
assert!(matches!(name.as_str(), "a" | "b" | "c"));
assert!(stream_to_values(stream).is_empty());
Ok(())
@@ -234,7 +244,10 @@ mod tests {
Ok(())
});
creator.finish(&mut mock_writer).await.unwrap();
creator
.finish(&mut mock_writer, BitmapType::Roaring)
.await
.unwrap();
}
fn set_bit(bit_vec: &mut BitVec, index: usize) {
@@ -283,20 +296,21 @@ mod tests {
async fn output(&mut self) -> Result<SortOutput> {
let segment_null_bitmap = self.values.remove(&None).unwrap_or_default();
let segment_null_bitmap = Bitmap::BitVec(segment_null_bitmap);
Ok(SortOutput {
segment_null_bitmap,
sorted_stream: Box::new(stream::iter(
std::mem::take(&mut self.values)
.into_iter()
.map(|(v, b)| Ok((v.unwrap(), b))),
.map(|(v, b)| Ok((v.unwrap(), Bitmap::BitVec(b)))),
)),
total_row_count: self.total_row_count,
})
}
}
fn stream_to_values(stream: SortedStream) -> Vec<Bytes> {
fn stream_to_values(stream: ValueStream) -> Vec<Bytes> {
futures::executor::block_on(async {
stream.map(|r| r.unwrap().0).collect::<Vec<Bytes>>().await
})

View File

@@ -110,6 +110,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to decode bitmap"))]
DecodeBitmap {
#[snafu(source)]
error: IoError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to decode protobuf"))]
DecodeProto {
#[snafu(source)]
@@ -240,6 +248,7 @@ impl ErrorExt for Error {
| CommonIo { .. }
| UnknownIntermediateCodecMagic { .. }
| FstCompile { .. }
| DecodeBitmap { .. }
| InvalidFooterPayloadSize { .. }
| BlobSizeTooSmall { .. } => StatusCode::Unexpected,

View File

@@ -18,11 +18,11 @@ use std::sync::Arc;
use async_trait::async_trait;
use bytes::Bytes;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::ResultExt;
use crate::inverted_index::error::{DecodeFstSnafu, Result};
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{DecodeBitmapSnafu, DecodeFstSnafu, Result};
pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
use crate::inverted_index::FstMap;
@@ -67,17 +67,25 @@ pub trait InvertedIndexReader: Send + Sync {
}
/// Retrieves the bitmap from the given offset and size.
async fn bitmap(&self, offset: u64, size: u32) -> Result<BitVec> {
self.range_read(offset, size).await.map(BitVec::from_vec)
async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
self.range_read(offset, size).await.and_then(|bytes| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
}
/// Retrieves multiple bitmaps from the given ranges.
async fn bitmap_deque(&mut self, ranges: &[Range<u64>]) -> Result<VecDeque<BitVec>> {
Ok(self
.read_vec(ranges)
.await?
async fn bitmap_deque(
&mut self,
ranges: &[(Range<u64>, BitmapType)],
) -> Result<VecDeque<Bitmap>> {
let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
let bytes = self.read_vec(&ranges).await?;
bytes
.into_iter()
.map(|bytes| BitVec::from_slice(bytes.as_ref()))
.collect::<VecDeque<_>>())
.zip(types)
.map(|(bytes, bitmap_type)| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
.collect::<Result<VecDeque<_>>>()
}
}

View File

@@ -78,14 +78,14 @@ impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
#[cfg(test)]
mod tests {
use common_base::bit_vec::prelude::*;
use fst::MapBuilder;
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta, InvertedIndexMetas};
use prost::Message;
use super::*;
use crate::bitmap::Bitmap;
fn create_fake_fst() -> Vec<u8> {
fn mock_fst() -> Vec<u8> {
let mut fst_buf = Vec::new();
let mut build = MapBuilder::new(&mut fst_buf).unwrap();
build.insert("key1".as_bytes(), 1).unwrap();
@@ -94,19 +94,27 @@ mod tests {
fst_buf
}
fn create_fake_bitmap() -> Vec<u8> {
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0, 1, 0].into_vec()
fn mock_bitmap() -> Bitmap {
Bitmap::from_lsb0_bytes(&[0b10101010, 0b10000000], BitmapType::Roaring)
}
fn mock_bitmap_bytes() -> Vec<u8> {
let mut buf = Vec::new();
mock_bitmap()
.serialize_into(BitmapType::Roaring, &mut buf)
.unwrap();
buf
}
fn create_inverted_index_blob() -> Vec<u8> {
let bitmap_size = create_fake_bitmap().len();
let fst_size = create_fake_fst().len();
let bitmap_size = mock_bitmap_bytes().len();
let fst_size = mock_fst().len();
// first index
let mut inverted_index = Vec::new();
inverted_index.extend_from_slice(&create_fake_bitmap()); // value bitmap
inverted_index.extend_from_slice(&create_fake_bitmap()); // null bitmap
inverted_index.extend_from_slice(&create_fake_fst()); // fst
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // value bitmap
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // null bitmap
inverted_index.extend_from_slice(&mock_fst()); // fst
let meta = InvertedIndexMeta {
name: "tag0".to_string(),
@@ -116,6 +124,7 @@ mod tests {
null_bitmap_size: bitmap_size as _,
relative_fst_offset: (bitmap_size * 2) as _,
fst_size: fst_size as _,
bitmap_type: BitmapType::Roaring as _,
..Default::default()
};
@@ -128,6 +137,7 @@ mod tests {
null_bitmap_size: bitmap_size as _,
relative_fst_offset: (bitmap_size * 2) as _,
fst_size: fst_size as _,
bitmap_type: BitmapType::Roaring as _,
..Default::default()
};
@@ -168,19 +178,19 @@ mod tests {
let meta0 = metas.metas.get("tag0").unwrap();
assert_eq!(meta0.name, "tag0");
assert_eq!(meta0.base_offset, 0);
assert_eq!(meta0.inverted_index_size, 54);
assert_eq!(meta0.relative_null_bitmap_offset, 2);
assert_eq!(meta0.null_bitmap_size, 2);
assert_eq!(meta0.relative_fst_offset, 4);
assert_eq!(meta0.inverted_index_size, 102);
assert_eq!(meta0.relative_null_bitmap_offset, 26);
assert_eq!(meta0.null_bitmap_size, 26);
assert_eq!(meta0.relative_fst_offset, 52);
assert_eq!(meta0.fst_size, 50);
let meta1 = metas.metas.get("tag1").unwrap();
assert_eq!(meta1.name, "tag1");
assert_eq!(meta1.base_offset, 54);
assert_eq!(meta1.inverted_index_size, 54);
assert_eq!(meta1.relative_null_bitmap_offset, 2);
assert_eq!(meta1.null_bitmap_size, 2);
assert_eq!(meta1.relative_fst_offset, 4);
assert_eq!(meta1.base_offset, 102);
assert_eq!(meta1.inverted_index_size, 102);
assert_eq!(meta1.relative_null_bitmap_offset, 26);
assert_eq!(meta1.null_bitmap_size, 26);
assert_eq!(meta1.relative_fst_offset, 52);
assert_eq!(meta1.fst_size, 50);
}
@@ -224,17 +234,29 @@ mod tests {
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag0").unwrap();
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag1").unwrap();
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
}
}

View File

@@ -18,14 +18,14 @@ mod single;
use std::num::NonZeroUsize;
use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::Result;
pub use crate::inverted_index::format::writer::blob::InvertedIndexBlobWriter;
use crate::Bytes;
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
/// Trait for writing inverted index data to underlying storage.
#[mockall::automock]
@@ -37,11 +37,13 @@ pub trait InvertedIndexWriter: Send {
/// * `null_bitmap` marks positions of null entries.
/// * `values` is a stream of values and their locations, yielded lexicographically.
/// Errors occur if the values are out of order.
/// * `bitmap_type` is the type of bitmap to encode.
async fn add_index(
&mut self,
name: String,
null_bitmap: BitVec,
null_bitmap: Bitmap,
values: ValueStream,
bitmap_type: BitmapType,
) -> Result<()>;
/// Finalizes the index writing process, ensuring all data is written.

View File

@@ -15,12 +15,12 @@
use std::num::NonZeroUsize;
use async_trait::async_trait;
use common_base::BitVec;
use futures::{AsyncWrite, AsyncWriteExt};
use greptime_proto::v1::index::InvertedIndexMetas;
use prost::Message;
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{CloseSnafu, FlushSnafu, Result, WriteSnafu};
use crate::inverted_index::format::writer::single::SingleIndexWriter;
use crate::inverted_index::format::writer::{InvertedIndexWriter, ValueStream};
@@ -43,8 +43,9 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
async fn add_index(
&mut self,
name: String,
null_bitmap: BitVec,
null_bitmap: Bitmap,
values: ValueStream,
bitmap_type: BitmapType,
) -> Result<()> {
let single_writer = SingleIndexWriter::new(
name.clone(),
@@ -52,6 +53,7 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
null_bitmap,
values,
&mut self.blob_writer,
bitmap_type,
);
let metadata = single_writer.write().await?;
@@ -100,6 +102,7 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexBlobWriter<W> {
#[cfg(test)]
mod tests {
use futures::stream;
use greptime_proto::v1::index::BitmapType;
use super::*;
use crate::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
@@ -132,24 +135,44 @@ mod tests {
writer
.add_index(
"tag0".to_string(),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
Box::new(stream::iter(vec![
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
])),
BitmapType::Roaring,
)
.await
.unwrap();
writer
.add_index(
"tag1".to_string(),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
Box::new(stream::iter(vec![
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
Ok((
Bytes::from("x"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("y"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("z"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
])),
BitmapType::Roaring,
)
.await
.unwrap();
@@ -181,22 +204,31 @@ mod tests {
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size)
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size)
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size)
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
// tag1
let tag1 = metadata.metas.get("tag1").unwrap();
@@ -215,21 +247,30 @@ mod tests {
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size)
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size)
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size)
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
}
}

View File

@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_base::BitVec;
use fst::MapBuilder;
use futures::{AsyncWrite, AsyncWriteExt, Stream, StreamExt};
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexStats};
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{FstCompileSnafu, FstInsertSnafu, Result, WriteSnafu};
use crate::Bytes;
@@ -27,7 +27,7 @@ pub struct SingleIndexWriter<W, S> {
blob_writer: W,
/// The null bitmap to be written
null_bitmap: BitVec,
null_bitmap: Bitmap,
/// The stream of values to be written, yielded lexicographically
values: S,
@@ -37,30 +37,40 @@ pub struct SingleIndexWriter<W, S> {
/// Metadata about the index
meta: InvertedIndexMeta,
/// The type of bitmap to use
bitmap_type: BitmapType,
/// Buffer for writing the blob
buf: Vec<u8>,
}
impl<W, S> SingleIndexWriter<W, S>
where
W: AsyncWrite + Send + Unpin,
S: Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin,
S: Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin,
{
/// Constructs a new `SingleIndexWriter`
pub fn new(
name: String,
base_offset: u64,
null_bitmap: BitVec,
null_bitmap: Bitmap,
values: S,
blob_writer: W,
bitmap_type: BitmapType,
) -> SingleIndexWriter<W, S> {
SingleIndexWriter {
blob_writer,
null_bitmap,
values,
fst: MapBuilder::memory(),
bitmap_type,
buf: Vec::new(),
meta: InvertedIndexMeta {
name,
base_offset,
stats: Some(InvertedIndexStats::default()),
bitmap_type: bitmap_type.into(),
..Default::default()
},
}
@@ -80,14 +90,17 @@ where
/// Writes the null bitmap to the blob and updates the metadata accordingly
async fn write_null_bitmap(&mut self) -> Result<()> {
let null_bitmap_bytes = self.null_bitmap.as_raw_slice();
self.buf.clear();
self.null_bitmap
.serialize_into(self.bitmap_type, &mut self.buf)
.expect("Write to vec should not fail");
self.blob_writer
.write_all(null_bitmap_bytes)
.write_all(&self.buf)
.await
.context(WriteSnafu)?;
self.meta.relative_null_bitmap_offset = self.meta.inverted_index_size as _;
self.meta.null_bitmap_size = null_bitmap_bytes.len() as _;
self.meta.null_bitmap_size = self.buf.len() as _;
self.meta.inverted_index_size += self.meta.null_bitmap_size as u64;
// update stats
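
The writer now serializes each bitmap into a reusable scratch buffer before writing, so `null_bitmap_size` records the serialized length rather than the raw `BitVec` byte length. A minimal sketch of that buffer-reuse pattern, assuming the Roaring variant wraps `roaring::RoaringBitmap` (the function below is illustrative, not the project's API):

```rust
use roaring::RoaringBitmap;

/// Serializes `bm` into the scratch buffer, appends it to the blob, and
/// returns the (relative offset, size) pair recorded in the index metadata.
fn write_bitmap(bm: &RoaringBitmap, blob: &mut Vec<u8>, buf: &mut Vec<u8>) -> (u64, u64) {
    buf.clear();
    bm.serialize_into(&mut *buf)
        .expect("writing to a Vec should not fail");
    let offset = blob.len() as u64;
    blob.extend_from_slice(buf);
    (offset, buf.len() as u64)
}
```
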
@@ -100,15 +113,18 @@ where
}
/// Appends a value and its bitmap to the blob, updates the FST, and the metadata
async fn append_value(&mut self, value: Bytes, bitmap: BitVec) -> Result<()> {
let bitmap_bytes = bitmap.into_vec();
async fn append_value(&mut self, value: Bytes, bitmap: Bitmap) -> Result<()> {
self.buf.clear();
bitmap
.serialize_into(self.bitmap_type, &mut self.buf)
.expect("Write to vec should not fail");
self.blob_writer
.write_all(&bitmap_bytes)
.write_all(&self.buf)
.await
.context(WriteSnafu)?;
let offset = self.meta.inverted_index_size as u32;
let size = bitmap_bytes.len() as u32;
let size = self.buf.len() as u32;
self.meta.inverted_index_size += size as u64;
let packed = bytemuck::cast::<[u32; 2], u64>([offset, size]);
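
Each FST value packs the relative bitmap offset and its serialized size (two `u32`s) into one `u64` via `bytemuck`; the mapper later recovers them with the inverse cast. A round-trip sketch (the `pack`/`unpack` names here are illustrative, not the crate's own helpers):

```rust
fn pack(offset: u32, size: u32) -> u64 {
    // Same cast as in `append_value` above.
    bytemuck::cast::<[u32; 2], u64>([offset, size])
}

fn unpack(packed: u64) -> (u32, u32) {
    // Inverse cast, as used by `ParallelFstValuesMapper`.
    let [offset, size] = bytemuck::cast::<u64, [u32; 2]>(packed);
    (offset, size)
}

fn main() {
    assert_eq!(unpack(pack(128, 42)), (128, 42));
}
```
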
@@ -157,9 +173,10 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
BitVec::new(),
Bitmap::new_roaring(),
stream::empty(),
&mut blob,
BitmapType::Roaring,
);
let meta = writer.write().await.unwrap();
@@ -174,13 +191,23 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
stream::iter(vec![
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
]),
&mut blob,
BitmapType::Roaring,
);
let meta = writer.write().await.unwrap();
@@ -199,13 +226,23 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
stream::iter(vec![
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
]),
&mut blob,
BitmapType::Roaring,
);
let res = writer.write().await;
assert!(matches!(res, Err(Error::FstInsert { .. })));


@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMeta;
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
@@ -36,7 +36,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
pub async fn map_values_vec(
&mut self,
value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
) -> Result<Vec<BitVec>> {
) -> Result<Vec<Bitmap>> {
let groups = value_and_meta_vec
.iter()
.map(|(values, _)| values.len())
@@ -50,15 +50,17 @@ impl<'a> ParallelFstValuesMapper<'a> {
// bitmap offset and the lower 32 bits represent its size. This mapper uses these
// combined offset-size pairs to fetch and union multiple bitmaps into a single `Bitmap`.
let [relative_offset, size] = bytemuck::cast::<u64, [u32; 2]>(*value);
fetch_ranges.push(
meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64,
);
let range = meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64;
fetch_ranges.push((
range,
BitmapType::try_from(meta.bitmap_type).unwrap_or(BitmapType::BitVec),
));
}
}
if fetch_ranges.is_empty() {
return Ok(vec![BitVec::new()]);
return Ok(vec![Bitmap::new_bitvec()]);
}
common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
@@ -66,14 +68,10 @@ impl<'a> ParallelFstValuesMapper<'a> {
let mut output = Vec::with_capacity(groups.len());
for counter in groups {
let mut bitmap = BitVec::new();
let mut bitmap = Bitmap::new_roaring();
for _ in 0..counter {
let bm = bitmaps.pop_front().unwrap();
if bm.len() > bitmap.len() {
bitmap = bm | bitmap
} else {
bitmap |= bm
}
bitmap.union(bm);
}
output.push(bitmap);
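
Each group of fetched bitmaps is now folded into a single result with `Bitmap::union` instead of the length-aware `BitVec` OR. A sketch of the same fold, assuming `roaring::RoaringBitmap` in place of the crate's `Bitmap`:

```rust
use std::collections::VecDeque;

use roaring::RoaringBitmap;

/// Pops `count` fetched bitmaps for one group and unions them into one result.
fn union_group(fetched: &mut VecDeque<RoaringBitmap>, count: usize) -> RoaringBitmap {
    let mut acc = RoaringBitmap::new();
    for _ in 0..count {
        let bm = fetched.pop_front().expect("one bitmap per requested range");
        acc |= bm;
    }
    acc
}
```
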
@@ -87,8 +85,6 @@ impl<'a> ParallelFstValuesMapper<'a> {
mod tests {
use std::collections::VecDeque;
use common_base::bit_vec::prelude::*;
use super::*;
use crate::inverted_index::format::reader::MockInvertedIndexReader;
@@ -101,19 +97,26 @@ mod tests {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for range in ranges {
for (range, bitmap_type) in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size) {
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]),
(2, 1) => output.push_back(bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]),
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
}
_ => unreachable!(),
}
}
Ok(output)
});
let meta = InvertedIndexMeta::default();
let meta = InvertedIndexMeta {
bitmap_type: BitmapType::Roaring.into(),
..Default::default()
};
let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);
let result = values_mapper
@@ -126,33 +129,50 @@ mod tests {
.map_values_vec(&[(vec![value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta)])
.await
.unwrap();
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
let result = values_mapper
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
.await
.unwrap();
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
assert_eq!(
result[1],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
let result = values_mapper
.map_values_vec(&[
(vec![value(2, 1), value(1, 1)], &meta),
@@ -160,7 +180,13 @@ mod tests {
])
.await
.unwrap();
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
assert_eq!(
result[1],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
}
}


@@ -15,17 +15,17 @@
mod predicates_apply;
use async_trait::async_trait;
use common_base::BitVec;
pub use predicates_apply::PredicatesIndexApplier;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
/// The output of an apply operation.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub struct ApplyOutput {
/// Bitmap of indices that match the predicates.
pub matched_segment_ids: BitVec,
pub matched_segment_ids: Bitmap,
/// The total number of rows in the index.
pub total_row_count: usize,


@@ -15,9 +15,9 @@
use std::mem::size_of;
use async_trait::async_trait;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::search::fst_apply::{
@@ -50,12 +50,11 @@ impl IndexApplier for PredicatesIndexApplier {
) -> Result<ApplyOutput> {
let metadata = reader.metadata().await?;
let mut output = ApplyOutput {
matched_segment_ids: BitVec::EMPTY,
matched_segment_ids: Bitmap::new_bitvec(),
total_row_count: metadata.total_row_count as _,
segment_row_count: metadata.segment_row_count as _,
};
let mut bitmap = Self::bitmap_full_range(&metadata);
// TODO(zhongzc): optimize the order of applying to make it quicker to return empty.
let mut appliers = Vec::with_capacity(self.fst_appliers.len());
let mut fst_ranges = Vec::with_capacity(self.fst_appliers.len());
@@ -81,7 +80,7 @@ impl IndexApplier for PredicatesIndexApplier {
}
if fst_ranges.is_empty() {
output.matched_segment_ids = bitmap;
output.matched_segment_ids = Self::bitmap_full_range(&metadata);
return Ok(output);
}
@@ -93,14 +92,15 @@ impl IndexApplier for PredicatesIndexApplier {
.collect::<Vec<_>>();
let mut mapper = ParallelFstValuesMapper::new(reader);
let bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
for bm in bm_vec {
if bitmap.count_ones() == 0 {
if bm.count_ones() == 0 {
break;
}
bitmap &= bm;
bitmap.intersect(bm);
}
output.matched_segment_ids = bitmap;
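
The applier folds the per-predicate bitmaps together by intersection and stops early once further intersection is pointless. A sketch of that early-exit fold, again assuming `roaring::RoaringBitmap` in place of the crate's `Bitmap` (the empty check on the accumulator is one reasonable choice, not necessarily the exact condition used upstream):

```rust
use roaring::RoaringBitmap;

/// Intersects all bitmaps, stopping early once the accumulator is empty.
fn intersect_all(mut bitmaps: Vec<RoaringBitmap>) -> RoaringBitmap {
    let mut acc = bitmaps.pop().unwrap_or_else(RoaringBitmap::new);
    for bm in bitmaps {
        if acc.is_empty() {
            break;
        }
        acc &= bm;
    }
    acc
}
```
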
@@ -146,12 +146,12 @@ impl PredicatesIndexApplier {
Ok(PredicatesIndexApplier { fst_appliers })
}
/// Creates a `BitVec` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> BitVec {
/// Creates a `Bitmap` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> Bitmap {
let total_count = metadata.total_row_count;
let segment_count = metadata.segment_row_count;
let len = total_count.div_ceil(segment_count);
BitVec::repeat(true, len as _)
Bitmap::full_bitvec(len as _)
}
}
@@ -167,10 +167,10 @@ mod tests {
use std::collections::VecDeque;
use std::sync::Arc;
use common_base::bit_vec::prelude::*;
use greptime_proto::v1::index::InvertedIndexMeta;
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
use super::*;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Error;
use crate::inverted_index::format::reader::MockInvertedIndexReader;
use crate::inverted_index::search::fst_apply::MockFstApplier;
@@ -190,6 +190,7 @@ mod tests {
let meta = InvertedIndexMeta {
name: s(tag),
relative_fst_offset: idx,
bitmap_type: BitmapType::Roaring.into(),
..Default::default()
};
metas.metas.insert(s(tag), meta);
@@ -229,10 +230,16 @@ mod tests {
.unwrap()])
});
mock_reader.expect_bitmap_deque().returning(|range| {
assert_eq!(range.len(), 1);
assert_eq!(range[0], 2..3);
Ok(VecDeque::from([bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]]))
mock_reader.expect_bitmap_deque().returning(|arg| {
assert_eq!(arg.len(), 1);
let range = &arg[0].0;
let bitmap_type = arg[0].1;
assert_eq!(*range, 2..3);
assert_eq!(bitmap_type, BitmapType::Roaring);
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
&[0b10101010],
bitmap_type,
)]))
});
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
@@ -240,7 +247,7 @@ mod tests {
.unwrap();
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
// An index reader with a single tag "tag-0" but without value "tag-0_value-0"
@@ -292,12 +299,16 @@ mod tests {
});
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for range in ranges {
for (range, bitmap_type) in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size) {
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]),
(2, 1) => output.push_back(bitvec![u8, Lsb0; 1, 1, 0, 1, 1, 0, 1, 1]),
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
}
_ => unreachable!(),
}
}
@@ -311,7 +322,7 @@ mod tests {
.unwrap();
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 0, 0, 0, 1, 0, 1, 0]
Bitmap::from_lsb0_bytes(&[0b10001010], BitmapType::Roaring)
);
}
@@ -330,10 +341,7 @@ mod tests {
.apply(SearchContext::default(), &mut mock_reader)
.await
.unwrap();
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
); // full range to scan
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
}
#[tokio::test]
@@ -405,10 +413,7 @@ mod tests {
)
.await
.unwrap();
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
);
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8));
}
#[test]


@@ -15,6 +15,7 @@
#![feature(iter_partition_in_place)]
#![feature(assert_matches)]
pub mod bitmap;
pub mod bloom_filter;
pub mod error;
pub mod external_provider;


@@ -40,15 +40,17 @@ pub enum Error {
actual: String,
},
#[snafu(display("Failed to start log store gc task"))]
StartGcTask {
#[snafu(display("Failed to start log store task: {}", name))]
StartWalTask {
name: String,
#[snafu(implicit)]
location: Location,
source: RuntimeError,
},
#[snafu(display("Failed to stop log store gc task"))]
StopGcTask {
#[snafu(display("Failed to stop log store task: {}", name))]
StopWalTask {
name: String,
#[snafu(implicit)]
location: Location,
source: RuntimeError,


@@ -35,7 +35,7 @@ use common_runtime::RepeatedTask;
use raft_engine::{Config, Engine, LogBatch, ReadableSize, RecoveryMode};
use snafu::{IntoError, ResultExt};
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartGcTaskSnafu};
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartWalTaskSnafu};
use crate::raft_engine::log_store::PurgeExpiredFilesFunction;
pub(crate) const SYSTEM_NAMESPACE: u64 = 0;
@@ -93,7 +93,8 @@ impl RaftEngineBackend {
);
gc_task
.start(common_runtime::global_runtime())
.context(StartGcTaskSnafu)?;
.context(StartWalTaskSnafu { name: "gc_task" })?;
Ok(Self {
engine: RwLock::new(engine),
_gc_task: gc_task,


@@ -14,7 +14,6 @@
use std::collections::{hash_map, HashMap};
use std::fmt::{Debug, Formatter};
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::time::Duration;
@@ -32,7 +31,7 @@ use store_api::storage::RegionId;
use crate::error::{
AddEntryLogBatchSnafu, DiscontinuousLogIndexSnafu, Error, FetchEntrySnafu,
IllegalNamespaceSnafu, IllegalStateSnafu, InvalidProviderSnafu, OverrideCompactedEntrySnafu,
RaftEngineSnafu, Result, StartGcTaskSnafu, StopGcTaskSnafu,
RaftEngineSnafu, Result, StartWalTaskSnafu, StopWalTaskSnafu,
};
use crate::metrics;
use crate::raft_engine::backend::SYSTEM_NAMESPACE;
@@ -46,7 +45,7 @@ pub struct RaftEngineLogStore {
read_batch_size: usize,
engine: Arc<Engine>,
gc_task: RepeatedTask<Error>,
last_sync_time: AtomicI64,
sync_task: RepeatedTask<Error>,
}
pub struct PurgeExpiredFilesFunction {
@@ -83,6 +82,31 @@ impl TaskFunction<Error> for PurgeExpiredFilesFunction {
}
}
pub struct SyncWalTaskFunction {
engine: Arc<Engine>,
}
#[async_trait::async_trait]
impl TaskFunction<Error> for SyncWalTaskFunction {
async fn call(&mut self) -> std::result::Result<(), Error> {
let engine = self.engine.clone();
if let Err(e) = tokio::task::spawn_blocking(move || engine.sync()).await {
error!(e; "Failed to sync raft engine log files");
};
Ok(())
}
fn name(&self) -> &str {
"SyncWalTaskFunction"
}
}
impl SyncWalTaskFunction {
pub fn new(engine: Arc<Engine>) -> Self {
Self { engine }
}
}
impl RaftEngineLogStore {
pub async fn try_new(dir: String, config: &RaftEngineConfig) -> Result<Self> {
let raft_engine_config = Config {
@@ -104,13 +128,18 @@ impl RaftEngineLogStore {
}),
);
let sync_task = RepeatedTask::new(
config.sync_period.unwrap_or(Duration::from_secs(5)),
Box::new(SyncWalTaskFunction::new(engine.clone())),
);
let log_store = Self {
sync_write: config.sync_write,
sync_period: config.sync_period,
read_batch_size: config.read_batch_size,
engine,
gc_task,
last_sync_time: AtomicI64::new(0),
sync_task,
};
log_store.start()?;
Ok(log_store)
@@ -123,7 +152,10 @@ impl RaftEngineLogStore {
fn start(&self) -> Result<()> {
self.gc_task
.start(common_runtime::global_runtime())
.context(StartGcTaskSnafu)
.context(StartWalTaskSnafu { name: "gc_task" })?;
self.sync_task
.start(common_runtime::global_runtime())
.context(StartWalTaskSnafu { name: "sync_task" })
}
fn span(&self, provider: &RaftEngineProvider) -> (Option<u64>, Option<u64>) {
@@ -220,7 +252,14 @@ impl LogStore for RaftEngineLogStore {
type Error = Error;
async fn stop(&self) -> Result<()> {
self.gc_task.stop().await.context(StopGcTaskSnafu)
self.gc_task
.stop()
.await
.context(StopWalTaskSnafu { name: "gc_task" })?;
self.sync_task
.stop()
.await
.context(StopWalTaskSnafu { name: "sync_task" })
}
/// Appends a batch of entries to logstore. `RaftEngineLogStore` assures the atomicity of
@@ -240,20 +279,9 @@ impl LogStore for RaftEngineLogStore {
}
let (mut batch, last_entry_ids) = self.entries_to_batch(entries)?;
let mut sync = self.sync_write;
if let Some(sync_period) = &self.sync_period {
let now = common_time::util::current_time_millis();
if now - self.last_sync_time.load(Ordering::Relaxed) >= sync_period.as_millis() as i64 {
self.last_sync_time.store(now, Ordering::Relaxed);
sync = true;
}
}
let _ = self
.engine
.write(&mut batch, sync)
.write(&mut batch, self.sync_write)
.context(RaftEngineSnafu)?;
Ok(AppendBatchResponse { last_entry_ids })
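
With the elapsed-time check removed from the write path, periodic durability now comes from the background sync task. A minimal sketch of an equivalent periodic sync loop under tokio (illustrative only; the real code drives `SyncWalTaskFunction` through `RepeatedTask`, and `eprintln!` stands in for the telemetry `error!` macro):

```rust
use std::sync::Arc;
use std::time::Duration;

use raft_engine::Engine;

/// Periodically flushes the raft-engine WAL, mirroring what the repeated
/// sync task does in the background.
async fn run_sync_loop(engine: Arc<Engine>, period: Duration) {
    let mut ticker = tokio::time::interval(period);
    loop {
        ticker.tick().await;
        let engine = engine.clone();
        // `Engine::sync` is blocking I/O, so run it off the async runtime.
        // Like the task above, only join failures are logged here.
        if let Err(e) = tokio::task::spawn_blocking(move || engine.sync()).await {
            eprintln!("failed to join WAL sync task: {e}");
        }
    }
}
```
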


@@ -31,13 +31,13 @@ fn main() {
#[tokio::main]
async fn run() {
let id = (1000u64, 2000u64);
let id = 2000u64;
let config = ChannelConfig::new()
.timeout(Duration::from_secs(3))
.connect_timeout(Duration::from_secs(5))
.tcp_nodelay(true);
let channel_manager = ChannelManager::with_config(config);
let mut meta_client = MetaClientBuilder::datanode_default_options(id.0, id.1)
let mut meta_client = MetaClientBuilder::datanode_default_options(id)
.channel_manager(channel_manager)
.build();
meta_client.start(&["127.0.0.1:3002"]).await.unwrap();


@@ -47,7 +47,6 @@ use common_meta::rpc::store::{
DeleteRangeResponse, PutRequest, PutResponse, RangeRequest, RangeResponse,
};
use common_meta::rpc::KeyValue;
use common_meta::ClusterId;
use common_telemetry::info;
use futures::TryStreamExt;
use heartbeat::Client as HeartbeatClient;
@@ -61,7 +60,7 @@ use crate::error::{
Result,
};
pub type Id = (u64, u64);
pub type Id = u64;
const DEFAULT_ASK_LEADER_MAX_RETRY: usize = 3;
const DEFAULT_SUBMIT_DDL_MAX_RETRY: usize = 3;
@@ -81,18 +80,18 @@ pub struct MetaClientBuilder {
}
impl MetaClientBuilder {
pub fn new(cluster_id: ClusterId, member_id: u64, role: Role) -> Self {
pub fn new(member_id: u64, role: Role) -> Self {
Self {
id: (cluster_id, member_id),
id: member_id,
role,
..Default::default()
}
}
/// Returns the role of Frontend's default options.
pub fn frontend_default_options(cluster_id: ClusterId) -> Self {
pub fn frontend_default_options() -> Self {
// Frontend does not need a member id.
Self::new(cluster_id, 0, Role::Frontend)
Self::new(0, Role::Frontend)
.enable_store()
.enable_heartbeat()
.enable_procedure()
@@ -100,18 +99,19 @@ impl MetaClientBuilder {
}
/// Returns the role of Datanode's default options.
pub fn datanode_default_options(cluster_id: ClusterId, member_id: u64) -> Self {
Self::new(cluster_id, member_id, Role::Datanode)
pub fn datanode_default_options(member_id: u64) -> Self {
Self::new(member_id, Role::Datanode)
.enable_store()
.enable_heartbeat()
}
/// Returns the role of Flownode's default options.
pub fn flownode_default_options(cluster_id: ClusterId, member_id: u64) -> Self {
Self::new(cluster_id, member_id, Role::Flownode)
pub fn flownode_default_options(member_id: u64) -> Self {
Self::new(member_id, Role::Flownode)
.enable_store()
.enable_heartbeat()
.enable_procedure()
.enable_access_cluster_info()
}
pub fn enable_heartbeat(self) -> Self {
@@ -273,15 +273,9 @@ impl ClusterInfo for MetaClient {
let cluster_client = self.cluster_client()?;
let (get_metasrv_nodes, nodes_key_prefix) = match role {
None => (
true,
Some(NodeInfoKey::key_prefix_with_cluster_id(self.id.0)),
),
None => (true, Some(NodeInfoKey::key_prefix())),
Some(ClusterRole::Metasrv) => (true, None),
Some(role) => (
false,
Some(NodeInfoKey::key_prefix_with_role(self.id.0, role)),
),
Some(role) => (false, Some(NodeInfoKey::key_prefix_with_role(role))),
};
let mut nodes = if get_metasrv_nodes {
@@ -324,7 +318,7 @@ impl ClusterInfo for MetaClient {
async fn list_region_stats(&self) -> Result<Vec<RegionStat>> {
let cluster_kv_backend = Arc::new(self.cluster_client()?);
let range_prefix = DatanodeStatKey::key_prefix_with_cluster_id(self.id.0);
let range_prefix = DatanodeStatKey::prefix_key();
let req = RangeRequest::new().with_prefix(range_prefix);
let stream =
PaginationStream::new(cluster_kv_backend, req, 256, decode_stats).into_stream();
@@ -555,6 +549,8 @@ impl MetaClient {
#[cfg(test)]
mod tests {
use std::sync::atomic::{AtomicUsize, Ordering};
use api::v1::meta::{HeartbeatRequest, Peer};
use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef};
use rand::Rng;
@@ -624,31 +620,31 @@ mod tests {
async fn test_meta_client_builder() {
let urls = &["127.0.0.1:3001", "127.0.0.1:3002"];
let mut meta_client = MetaClientBuilder::new(0, 0, Role::Datanode)
let mut meta_client = MetaClientBuilder::new(0, Role::Datanode)
.enable_heartbeat()
.build();
let _ = meta_client.heartbeat_client().unwrap();
assert!(meta_client.store_client().is_err());
meta_client.start(urls).await.unwrap();
let mut meta_client = MetaClientBuilder::new(0, 0, Role::Datanode).build();
let mut meta_client = MetaClientBuilder::new(0, Role::Datanode).build();
assert!(meta_client.heartbeat_client().is_err());
assert!(meta_client.store_client().is_err());
meta_client.start(urls).await.unwrap();
let mut meta_client = MetaClientBuilder::new(0, 0, Role::Datanode)
let mut meta_client = MetaClientBuilder::new(0, Role::Datanode)
.enable_store()
.build();
assert!(meta_client.heartbeat_client().is_err());
let _ = meta_client.store_client().unwrap();
meta_client.start(urls).await.unwrap();
let mut meta_client = MetaClientBuilder::new(1, 2, Role::Datanode)
let mut meta_client = MetaClientBuilder::new(2, Role::Datanode)
.enable_heartbeat()
.enable_store()
.build();
assert_eq!(1, meta_client.id().0);
assert_eq!(2, meta_client.id().1);
assert_eq!(2, meta_client.id());
let _ = meta_client.heartbeat_client().unwrap();
let _ = meta_client.store_client().unwrap();
meta_client.start(urls).await.unwrap();
@@ -657,7 +653,7 @@ mod tests {
#[tokio::test]
async fn test_not_start_heartbeat_client() {
let urls = &["127.0.0.1:3001", "127.0.0.1:3002"];
let mut meta_client = MetaClientBuilder::new(0, 0, Role::Datanode)
let mut meta_client = MetaClientBuilder::new(0, Role::Datanode)
.enable_store()
.build();
meta_client.start(urls).await.unwrap();
@@ -668,7 +664,7 @@ mod tests {
#[tokio::test]
async fn test_not_start_store_client() {
let urls = &["127.0.0.1:3001", "127.0.0.1:3002"];
let mut meta_client = MetaClientBuilder::new(0, 0, Role::Datanode)
let mut meta_client = MetaClientBuilder::new(0, Role::Datanode)
.enable_heartbeat()
.build();
@@ -688,6 +684,9 @@ mod tests {
let tc = new_client("test_heartbeat").await;
let (sender, mut receiver) = tc.client.heartbeat().await.unwrap();
// send heartbeats
let request_sent = Arc::new(AtomicUsize::new(0));
let request_sent_clone = request_sent.clone();
let _handle = tokio::spawn(async move {
for _ in 0..5 {
let req = HeartbeatRequest {
@@ -698,14 +697,24 @@ mod tests {
..Default::default()
};
sender.send(req).await.unwrap();
request_sent_clone.fetch_add(1, Ordering::Relaxed);
}
});
let _handle = tokio::spawn(async move {
while let Some(res) = receiver.message().await.unwrap() {
assert_eq!(1000, res.header.unwrap().cluster_id);
let heartbeat_count = Arc::new(AtomicUsize::new(0));
let heartbeat_count_clone = heartbeat_count.clone();
let handle = tokio::spawn(async move {
while let Some(_resp) = receiver.message().await.unwrap() {
heartbeat_count_clone.fetch_add(1, Ordering::Relaxed);
}
});
handle.await.unwrap();
// +1 for the initial response
assert_eq!(
request_sent.load(Ordering::Relaxed) + 1,
heartbeat_count.load(Ordering::Relaxed)
);
}
#[tokio::test]

Some files were not shown because too many files have changed in this diff.