Compare commits

..

39 Commits

Author SHA1 Message Date
evenyag   d5760a7348  chore: remove unused codes  2025-03-17 15:20:42 +08:00
discord9  bc9614e22c  feat: file parallel  2025-03-10 21:00:40 +08:00
discord9  7dd9e98ff6  docs: chore  2025-03-10 16:12:28 +08:00
evenyag   fb6b7f7801  fix: use label value to add map  2025-03-10 15:17:59 +08:00
evenyag   87d7c316df  fix: use label value as table name  2025-03-10 14:42:19 +08:00
evenyag   c80a73bc20  feat: use pb in parquet  2025-03-10 14:40:29 +08:00
discord9  dd9d13e7df  fix: cli arg  2025-03-10 14:18:35 +08:00
evenyag   79d249f5fa  feat: fix panic in TimeSeriesParquetReader  2025-03-10 14:13:37 +08:00
evenyag   63bc544514  refactor: use constant  2025-03-10 14:02:27 +08:00
evenyag   30c29539a3  feat: special handle metric engine path  2025-03-10 13:58:46 +08:00
evenyag   359da62d9e  feat: use parquet  2025-03-10 13:36:49 +08:00
evenyag   c9f4b36360  fix: use flushed_sequence as we can't set sequence in ingester  2025-03-10 13:36:49 +08:00
discord9  85c346b16a  chore: progress bar  2025-03-10 11:53:33 +08:00
discord9  738c23beb0  feat: time unit  2025-03-10 11:25:23 +08:00
evenyag   8aadd1e59a  feat: parquet remote write reader  2025-03-09 23:42:08 +08:00
discord9  cbd58291da  chore: more logs  2025-03-09 23:29:58 +08:00
evenyag   e522e8959b  chore: add more logs  2025-03-09 21:19:55 +08:00
evenyag   7183a93e5a  feat: sanitize mito config  2025-03-09 21:05:21 +08:00
evenyag   8c538622e2  feat: add logs  2025-03-09 20:52:02 +08:00
evenyag   142dacb2c8  chore: update fs object build  2025-03-09 20:52:02 +08:00
discord9  371afc458f  chore: init logging  2025-03-09 20:44:53 +08:00
discord9  0751cd74c0  feat: all in one cfg  2025-03-09 20:36:10 +08:00
discord9  ec34e8739a  fix: is file  2025-03-09 19:55:12 +08:00
evenyag   b650743785  feat: implement converter convert  2025-03-09 19:53:36 +08:00
discord9  80a8b2e1bd  feat: debug output file option  2025-03-09 17:23:14 +08:00
discord9  ec8a15cadd  feat: ingester(WIP)  2025-03-09 16:57:26 +08:00
evenyag   f929d751a5  feat: update api  2025-03-09 16:39:35 +08:00
evenyag   fad3621a7a  feat: define converter api  2025-03-09 16:05:52 +08:00
evenyag   87723effc7  feat: declare converter  2025-03-09 15:33:49 +08:00
evenyag   62a333ad09  feat: import datanode  2025-03-09 15:32:02 +08:00
evenyag   6ad186a13e  feat: series to batch  2025-03-09 15:09:13 +08:00
discord9  77dee84a75  fix: parquet also sort by pk  2025-03-09 14:47:34 +08:00
evenyag   a57e263e5a  feat: sort time series  2025-03-08 22:20:13 +08:00
discord9  8796ddaf31  chore: remove unwrap  2025-03-08 20:32:11 +08:00
discord9  7fa3fbdfef  feat: parquet reader  2025-03-08 20:27:44 +08:00
jeremyhi  457d2a620c  feat: add get table api  2025-03-08 19:53:15 +08:00
evenyag   9f14edbb28  feat: implement sst writer  2025-03-08 17:22:03 +08:00
evenyag   cb3fad0c2d  chore: add deps  2025-03-08 16:17:49 +08:00
evenyag   2d1e7c2441  feat: init the converter crate  2025-03-08 14:15:35 +08:00
125 changed files with 6380 additions and 8960 deletions

View File

@@ -1,52 +0,0 @@
name: Check Grafana Panels
on:
pull_request:
branches:
- main
paths:
- 'grafana/**' # Trigger only when files under the grafana/ directory change
jobs:
check-panels:
runs-on: ubuntu-latest
steps:
# Check out the repository
- name: Checkout repository
uses: actions/checkout@v4
# Install jq (required for the script)
- name: Install jq
run: sudo apt-get install -y jq
# Make the check.sh script executable
- name: Make check.sh executable
run: chmod +x grafana/check.sh
# Run the check.sh script
- name: Run check.sh
run: ./grafana/check.sh
# Only run summary.sh for pull_request events (not for merge queues or final pushes)
- name: Check if this is a pull request
id: check-pr
run: |
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
echo "is_pull_request=true" >> $GITHUB_OUTPUT
else
echo "is_pull_request=false" >> $GITHUB_OUTPUT
fi
# Make the summary.sh script executable
- name: Make summary.sh executable
if: steps.check-pr.outputs.is_pull_request == 'true'
run: chmod +x grafana/summary.sh
# Run the summary.sh script and add its output to the GitHub Job Summary
- name: Run summary.sh and add to Job Summary
if: steps.check-pr.outputs.is_pull_request == 'true'
run: |
SUMMARY=$(./grafana/summary.sh)
echo "### Summary of Grafana Panels" >> $GITHUB_STEP_SUMMARY
echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY

Cargo.lock (generated)
View File

@@ -1594,7 +1594,7 @@ dependencies = [
"bitflags 1.3.2",
"strsim 0.8.0",
"textwrap 0.11.0",
"unicode-width",
"unicode-width 0.1.14",
"vec_map",
]
@@ -1876,7 +1876,7 @@ checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
dependencies = [
"strum 0.26.3",
"strum_macros 0.26.4",
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -2469,6 +2469,7 @@ dependencies = [
"encode_unicode",
"lazy_static",
"libc",
"unicode-width 0.1.14",
"windows-sys 0.52.0",
]
@@ -4167,7 +4168,6 @@ dependencies = [
"bytes",
"cache",
"catalog",
"chrono",
"client",
"common-base",
"common-catalog",
@@ -4646,7 +4646,7 @@ version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -5567,7 +5567,6 @@ dependencies = [
"rand",
"regex",
"regex-automata 0.4.8",
"roaring",
"serde",
"serde_json",
"snafu 0.8.5",
@@ -5601,6 +5600,19 @@ dependencies = [
"serde",
]
[[package]]
name = "indicatif"
version = "0.17.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
dependencies = [
"console",
"number_prefix",
"portable-atomic",
"unicode-width 0.2.0",
"web-time 1.1.0",
]
[[package]]
name = "inferno"
version = "0.11.21"
@@ -5630,6 +5642,25 @@ dependencies = [
"snafu 0.7.5",
]
[[package]]
name = "ingester"
version = "0.13.0"
dependencies = [
"clap 4.5.19",
"common-telemetry",
"common-time",
"datanode",
"meta-client",
"mito2",
"object-store",
"reqwest",
"serde",
"serde_json",
"sst-convert",
"tokio",
"toml 0.8.19",
]
[[package]]
name = "inotify"
version = "0.9.6"
@@ -5899,15 +5930,15 @@ dependencies = [
[[package]]
name = "jsonpath-rust"
version = "0.7.5"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c00ae348f9f8fd2d09f82a98ca381c60df9e0820d8d79fce43e649b4dc3128b"
checksum = "69a61b87f6a55cc6c28fed5739dd36b9642321ce63e4a5e4a4715d69106f4a10"
dependencies = [
"pest",
"pest_derive",
"regex",
"serde_json",
"thiserror 2.0.12",
"thiserror 1.0.64",
]
[[package]]
@@ -7519,6 +7550,12 @@ dependencies = [
"libc",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "objc"
version = "0.2.7"
@@ -7975,7 +8012,7 @@ version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2ad9b889f1b12e0b9ee24db044b5129150d5eada288edc800f789928dc8c0e3"
dependencies = [
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -8071,6 +8108,19 @@ dependencies = [
"zstd-sys",
]
[[package]]
name = "parquet_opendal"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4140ae96f37c170f8d684a544711fabdac1d94adcbd97e8b033329bd37f40446"
dependencies = [
"async-trait",
"bytes",
"futures",
"opendal",
"parquet",
]
[[package]]
name = "parse-zoneinfo"
version = "0.3.1"
@@ -8272,7 +8322,7 @@ dependencies = [
"rand",
"ring",
"rust_decimal",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tokio",
"tokio-rustls 0.26.0",
"tokio-util",
@@ -8384,7 +8434,7 @@ dependencies = [
"greptime-proto",
"itertools 0.10.5",
"jsonb",
"jsonpath-rust 0.7.5",
"jsonpath-rust 0.7.3",
"lazy_static",
"moka",
"once_cell",
@@ -8762,7 +8812,6 @@ dependencies = [
"common-recordbatch",
"common-telemetry",
"datafusion",
"datafusion-common",
"datafusion-expr",
"datatypes",
"futures",
@@ -8776,9 +8825,8 @@ dependencies = [
[[package]]
name = "promql-parser"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c6b1429bdd199d53bd58b745075c1652efedbe2746e5d4f0d56d3184dda48ec"
version = "0.4.3"
source = "git+https://github.com/GreptimeTeam/promql-parser.git?rev=27abb8e16003a50c720f00d6c85f41f5fa2a2a8e#27abb8e16003a50c720f00d6c85f41f5fa2a2a8e"
dependencies = [
"cfgrammar",
"chrono",
@@ -9636,16 +9684,6 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "roaring"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41589aba99537475bf697f2118357cad1c31590c5a1b9f6d9fc4ad6d07503661"
dependencies = [
"bytemuck",
"byteorder",
]
[[package]]
name = "robust"
version = "1.1.0"
@@ -10070,7 +10108,7 @@ dependencies = [
"radix_trie",
"scopeguard",
"unicode-segmentation",
"unicode-width",
"unicode-width 0.1.14",
"utf8parse",
"winapi",
]
@@ -11065,7 +11103,7 @@ dependencies = [
"serde_json",
"sha2",
"smallvec",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tokio",
"tokio-stream",
"tracing",
@@ -11150,7 +11188,7 @@ dependencies = [
"smallvec",
"sqlx-core",
"stringprep",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tracing",
"whoami",
]
@@ -11188,7 +11226,7 @@ dependencies = [
"smallvec",
"sqlx-core",
"stringprep",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tracing",
"whoami",
]
@@ -11217,6 +11255,36 @@ dependencies = [
"url",
]
[[package]]
name = "sst-convert"
version = "0.13.0"
dependencies = [
"api",
"arrow-array",
"async-trait",
"catalog",
"common-error",
"common-macro",
"common-meta",
"common-recordbatch",
"common-telemetry",
"datanode",
"datatypes",
"futures",
"futures-util",
"indicatif",
"meta-client",
"metric-engine",
"mito2",
"object-store",
"parquet",
"parquet_opendal",
"prost 0.13.3",
"snafu 0.8.5",
"store-api",
"table",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
@@ -11949,7 +12017,7 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -11969,11 +12037,11 @@ dependencies = [
[[package]]
name = "thiserror"
version = "2.0.12"
version = "2.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47"
dependencies = [
"thiserror-impl 2.0.12",
"thiserror-impl 2.0.6",
]
[[package]]
@@ -11989,9 +12057,9 @@ dependencies = [
[[package]]
name = "thiserror-impl"
version = "2.0.12"
version = "2.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312"
dependencies = [
"proc-macro2",
"quote",
@@ -13052,6 +13120,12 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
[[package]]
name = "unicode-width"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
[[package]]
name = "unicode-xid"
version = "0.2.6"

View File

@@ -41,6 +41,7 @@ members = [
"src/flow",
"src/frontend",
"src/index",
"src/ingester",
"src/log-query",
"src/log-store",
"src/meta-client",
@@ -58,6 +59,7 @@ members = [
"src/servers",
"src/session",
"src/sql",
"src/sst-convert",
"src/store-api",
"src/table",
"tests-fuzz",
@@ -160,7 +162,9 @@ parquet = { version = "53.0.0", default-features = false, features = ["arrow", "
paste = "1.0"
pin-project = "1.0"
prometheus = { version = "0.13.3", features = ["process"] }
promql-parser = { version = "0.5", features = ["ser"] }
promql-parser = { git = "https://github.com/GreptimeTeam/promql-parser.git", features = [
"ser",
], rev = "27abb8e16003a50c720f00d6c85f41f5fa2a2a8e" }
prost = "0.13"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.8"
@@ -269,6 +273,7 @@ query = { path = "src/query" }
servers = { path = "src/servers" }
session = { path = "src/session" }
sql = { path = "src/sql" }
sst-convert = { path = "src/sst-convert" }
store-api = { path = "src/store-api" }
substrait = { path = "src/common/substrait" }
table = { path = "src/table" }

chore.md (new file)
View File

@@ -0,0 +1,76 @@
# log
## first create table
```bash
mysql --host=127.0.0.1 --port=19195 --database=public;
```
```sql
CREATE DATABASE IF NOT EXISTS `cluster1`;
USE `cluster1`;
CREATE TABLE IF NOT EXISTS `app1` (
`greptime_timestamp` TimestampNanosecond NOT NULL TIME INDEX,
`app` STRING NULL INVERTED INDEX,
`cluster` STRING NULL INVERTED INDEX,
`message` STRING NULL,
`region` STRING NULL,
`cloud-provider` STRING NULL,
`environment` STRING NULL,
`product` STRING NULL,
`sub-product` STRING NULL,
`service` STRING NULL
) WITH (
append_mode = 'true',
'compaction.type' = 'twcs',
'compaction.twcs.max_output_file_size' = '500MB',
'compaction.twcs.max_active_window_files' = '16',
'compaction.twcs.max_active_window_runs' = '4',
'compaction.twcs.max_inactive_window_files' = '4',
'compaction.twcs.max_inactive_window_runs' = '2'
);
select count(*) from app1;
SELECT * FROM app1 ORDER BY greptime_timestamp DESC LIMIT 10\G
```
## then ingest
```bash
RUST_LOG="debug" cargo run --bin=ingester -- --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --parquet-dir="parquet_store/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
```
# metrics!!!!!!!
```bash
mysql --host=127.0.0.1 --port=19195 --database=public < public.greptime_physical_table-create-tables.sql
```
## then ingest
```bash
RUST_LOG="debug"
cargo run --bin=ingester -- --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --remote-write-dir="metrics_parquet/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
# perf it
cargo build --release --bin=ingester
samply record target/release/ingester --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --remote-write-dir="metrics_parquet/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
```
## check data
```sql
select count(*) from greptime_physical_table;
+----------+
| count(*) |
+----------+
| 36200 |
+----------+
1 row in set (0.06 sec)
select count(*) from storage_operation_errors_total;
+----------+
| count(*) |
+----------+
| 10 |
+----------+
1 row in set (0.03 sec)
```
# with oss
the steps are the same; the only difference is the storage config in `ingester.toml`
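For reference, a rough sketch of what the `[storage]` section might look like when pointing at OSS; the option names follow the usual GreptimeDB object storage settings, and the bucket, credentials, and endpoint below are placeholders:
```toml
## The data storage options (OSS variant, values are placeholders).
[storage]
## Local working directory for caches and temporary files.
data_home = "/tmp/greptimedb-cluster/datanode0"
type = "Oss"
## Target bucket and root path inside it.
bucket = "my-greptimedb-bucket"
root = "data"
## Credentials and region endpoint for the bucket.
access_key_id = "<access key id>"
access_key_secret = "<access key secret>"
endpoint = "https://oss-cn-hangzhou.aliyuncs.com"
```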

View File

@@ -1,19 +0,0 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
# Use jq to check for panels with empty or missing descriptions
invalid_panels=$(cat $BASEDIR/greptimedb-cluster.json | jq -r '
.panels[]
| select((.type == "stats" or .type == "timeseries") and (.description == "" or .description == null))
')
# Check if any invalid panels were found
if [[ -n "$invalid_panels" ]]; then
echo "Error: The following panels have empty or missing descriptions:"
echo "$invalid_panels"
exit 1
else
echo "All panels with type 'stats' or 'timeseries' have valid descriptions."
exit 0
fi

File diff suppressed because it is too large

View File

@@ -1,11 +0,0 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
echo '| Title | Description | Expressions |
|---|---|---|'
cat $BASEDIR/greptimedb-cluster.json | jq -r '
.panels |
map(select(.type == "stat" or .type == "timeseries")) |
.[] | "| \(.title) | \(.description | gsub("\n"; "<br>")) | \(.targets | map(.expr // .rawSql | "`\(.|gsub("\n"; "<br>"))`") | join("<br>")) |"
'

ingester.toml (new file)
View File

@@ -0,0 +1,35 @@
## The metasrv client options.
[meta_client]
## The addresses of the metasrv.
metasrv_addrs = ["127.0.0.1:3002", "127.0.0.1:3003"]
## Operation timeout.
timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout.
ddl_timeout = "10s"
## Connect server timeout.
connect_timeout = "1s"
## `TCP_NODELAY` option for accepted connections.
tcp_nodelay = true
## The configuration about the cache of the metadata.
metadata_cache_max_capacity = 100000
## TTL of the metadata cache.
metadata_cache_ttl = "10m"
## TTI of the metadata cache.
metadata_cache_tti = "5m"
## The data storage options.
[storage]
## The working home directory.
data_home = "/tmp/greptimedb-cluster/datanode0"
type = "File"
[mito]

View File

@@ -16,6 +16,7 @@
mod client;
pub mod client_manager;
#[cfg(feature = "testing")]
mod database;
pub mod error;
pub mod flow;
@@ -33,6 +34,7 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;
pub use self::client::Client;
#[cfg(feature = "testing")]
pub use self::database::Database;
pub use self::error::{Error, Result};
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};

View File

@@ -32,7 +32,7 @@ use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::TracingOptions;
use common_version::{short_version, version};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendClient, FrontendInvoker};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
use meta_client::{MetaClientOptions, MetaClientType};
use servers::Mode;
use snafu::{OptionExt, ResultExt};
@@ -311,8 +311,6 @@ impl StartCommand {
Arc::new(executor),
);
let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
let flownode_builder = FlownodeBuilder::new(
opts,
@@ -320,7 +318,6 @@ impl StartCommand {
table_metadata_manager,
catalog_manager.clone(),
flow_metadata_manager,
Arc::new(frontend_client),
)
.with_heartbeat_task(heartbeat_task);

View File

@@ -54,10 +54,7 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
use datanode::datanode::{Datanode, DatanodeBuilder};
use datanode::region_server::RegionServer;
use file_engine::config::EngineConfig as FileEngineConfig;
use flow::{
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendClient,
FrontendInvoker,
};
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
use frontend::frontend::FrontendOptions;
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
@@ -536,16 +533,12 @@ impl StartCommand {
flow: opts.flow.clone(),
..Default::default()
};
let fe_server_addr = fe_opts.grpc.bind_addr.clone();
let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
flownode_options,
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
Arc::new(frontend_client),
);
let flownode = Arc::new(
flow_builder

View File

@@ -445,20 +445,10 @@ impl Pool {
async fn recycle_channel_in_loop(pool: Arc<Pool>, interval_secs: u64) {
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
// use weak ref here to prevent pool being leaked
let pool_weak = {
let weak = Arc::downgrade(&pool);
drop(pool);
weak
};
loop {
let _ = interval.tick().await;
if let Some(pool) = pool_weak.upgrade() {
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
} else {
// no one is using this pool, so we can also let go
break;
}
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
}
}

View File

@@ -337,7 +337,6 @@ pub enum FlowType {
impl FlowType {
pub const RECORDING_RULE: &str = "recording_rule";
pub const STREAMING: &str = "streaming";
pub const FLOW_TYPE_KEY: &str = "flow_type";
}
impl Default for FlowType {
@@ -392,8 +391,7 @@ impl From<&CreateFlowData> for CreateRequest {
};
let flow_type = value.flow_type.unwrap_or_default().to_string();
req.flow_options
.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
req.flow_options.insert("flow_type".to_string(), flow_type);
req
}
}
@@ -425,7 +423,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
.collect::<Vec<_>>();
let flow_type = value.flow_type.unwrap_or_default().to_string();
options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
options.insert("flow_type".to_string(), flow_type);
let flow_info = FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),

View File

@@ -25,6 +25,6 @@ pub mod heartbeat;
pub mod metrics;
pub mod region_server;
pub mod service;
mod store;
pub mod store;
#[cfg(any(test, feature = "testing"))]
pub mod tests;

View File

@@ -15,7 +15,7 @@
//! object storage utilities
mod azblob;
mod fs;
pub mod fs;
mod gcs;
mod oss;
mod s3;

View File

@@ -24,7 +24,8 @@ use crate::config::FileConfig;
use crate::error::{self, Result};
use crate::store;
pub(crate) async fn new_fs_object_store(
/// A helper function to create a file system object store.
pub async fn new_fs_object_store(
data_home: &str,
_file_config: &FileConfig,
) -> Result<ObjectStore> {

View File

@@ -16,7 +16,6 @@ async-trait.workspace = true
bytes.workspace = true
cache.workspace = true
catalog.workspace = true
chrono.workspace = true
client.workspace = true
common-base.workspace = true
common-config.workspace = true

View File

@@ -49,13 +49,12 @@ pub(crate) use crate::adapter::node_context::FlownodeContext;
use crate::adapter::refill::RefillTask;
use crate::adapter::table_source::ManagedTableSource;
use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
pub(crate) use crate::adapter::worker::{create_worker, WorkerHandle};
pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
use crate::compute::ErrCollector;
use crate::df_optimizer::sql_to_flow_plan;
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
use crate::expr::Batch;
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
use crate::recording_rules::RecordingRuleEngine;
use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
mod flownode_impl;
@@ -64,7 +63,7 @@ pub(crate) mod refill;
mod stat;
#[cfg(test)]
mod tests;
pub(crate) mod util;
mod util;
mod worker;
pub(crate) mod node_context;
@@ -170,8 +169,6 @@ pub struct FlowWorkerManager {
flush_lock: RwLock<()>,
/// receive a oneshot sender to send state size report
state_report_handler: RwLock<Option<StateReportHandler>>,
/// engine for recording rule
rule_engine: RecordingRuleEngine,
}
/// Building FlownodeManager
@@ -186,7 +183,6 @@ impl FlowWorkerManager {
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
rule_engine: RecordingRuleEngine,
) -> Self {
let srv_map = ManagedTableSource::new(
table_meta.table_info_manager().clone(),
@@ -209,7 +205,6 @@ impl FlowWorkerManager {
node_id,
flush_lock: RwLock::new(()),
state_report_handler: RwLock::new(None),
rule_engine,
}
}
@@ -218,6 +213,25 @@ impl FlowWorkerManager {
self
}
/// Create a flownode manager together with `num_workers` workers
pub fn new_with_workers<'s>(
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
num_workers: usize,
) -> (Self, Vec<Worker<'s>>) {
let mut zelf = Self::new(node_id, query_engine, table_meta);
let workers: Vec<_> = (0..num_workers)
.map(|_| {
let (handle, worker) = create_worker();
zelf.add_worker_handle(handle);
worker
})
.collect();
(zelf, workers)
}
/// add a worker handle to the manager, meaning the corresponding worker is under its management
pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
self.worker_handles.push(handle);
@@ -735,11 +749,7 @@ pub struct CreateFlowArgs {
/// Create&Remove flow
impl FlowWorkerManager {
/// remove a flow by it's id
#[allow(unreachable_code)]
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.remove_flow(flow_id).await;
for handle in self.worker_handles.iter() {
if handle.contains_flow(flow_id).await? {
handle.remove_flow(flow_id).await?;
@@ -755,10 +765,8 @@ impl FlowWorkerManager {
/// steps to create task:
/// 1. parse query into typed plan(and optional parse expire_after expr)
/// 2. render source/sink with output table id and used input table id
#[allow(clippy::too_many_arguments, unreachable_code)]
#[allow(clippy::too_many_arguments)]
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.create_flow(args).await;
let CreateFlowArgs {
flow_id,
sink_table_name,

View File

@@ -153,13 +153,7 @@ impl Flownode for FlowWorkerManager {
}
}
#[allow(unreachable_code, unused)]
async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
return self
.rule_engine
.handle_inserts(request)
.await
.map_err(to_meta_err(snafu::location!()));
// using try_read to ensure two things:
// 1. flush wouldn't happen until inserts before it is inserted
// 2. inserts happening concurrently with flush wouldn't be block by flush
@@ -212,15 +206,15 @@ impl Flownode for FlowWorkerManager {
.collect_vec();
let table_col_names = table_schema.relation_desc.names;
let table_col_names = table_col_names
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
let name_to_col = HashMap::<_, _>::from_iter(
insert_schema
.iter()

View File

@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Some utility functions
use std::sync::Arc;
use api::helper::ColumnDataTypeWrapper;

View File

@@ -16,7 +16,6 @@
use std::any::Any;
use arrow_schema::ArrowError;
use common_error::ext::BoxedError;
use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
use common_macro::stack_trace_debug;
@@ -54,13 +53,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Time error"))]
Time {
source: common_time::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("External error"))]
External {
source: BoxedError,
@@ -164,15 +156,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Arrow error: {raw:?} in context: {context}"))]
Arrow {
#[snafu(source)]
raw: ArrowError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
Datafusion {
#[snafu(source)]
@@ -247,7 +230,6 @@ impl ErrorExt for Error {
match self {
Self::Eval { .. }
| Self::JoinTask { .. }
| Self::Arrow { .. }
| Self::Datafusion { .. }
| Self::InsertIntoFlow { .. } => StatusCode::Internal,
Self::FlowAlreadyExist { .. } => StatusCode::TableAlreadyExists,
@@ -256,9 +238,7 @@ impl ErrorExt for Error {
| Self::FlowNotFound { .. }
| Self::ListFlows { .. } => StatusCode::TableNotFound,
Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
Self::InvalidQuery { .. } | Self::CreateFlow { .. } | Self::Time { .. } => {
StatusCode::EngineExecuteQuery
}
Self::InvalidQuery { .. } | Self::CreateFlow { .. } => StatusCode::EngineExecuteQuery,
Self::Unexpected { .. } => StatusCode::Unexpected,
Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
StatusCode::Unsupported

View File

@@ -238,7 +238,6 @@ mod test {
for (sql, current, expected) in &testcases {
let plan = sql_to_substrait(engine.clone(), sql).await;
let mut ctx = create_test_ctx();
let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
.await

View File

@@ -130,6 +130,13 @@ impl HeartbeatTask {
pub fn shutdown(&self) {
info!("Close heartbeat task for flownode");
if self
.running
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
.is_err()
{
warn!("Call close heartbeat task multiple times");
}
}
fn new_heartbeat_request(

View File

@@ -33,7 +33,6 @@ mod expr;
pub mod heartbeat;
mod metrics;
mod plan;
mod recording_rules;
mod repr;
mod server;
mod transform;
@@ -44,5 +43,4 @@ mod test_utils;
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
pub use error::{Error, Result};
pub use recording_rules::FrontendClient;
pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};

View File

@@ -28,32 +28,6 @@ lazy_static! {
&["table_id"]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_QUERY_TIME: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_query_time",
"flow rule engine query time",
&["flow_id"],
vec![
0.0,
1.,
3.,
5.,
10.,
20.,
30.,
60.,
2. * 60.,
5. * 60.,
10. * 60.
]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_SLOW_QUERY: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_slow_query",
"flow rule engine slow query",
&["flow_id", "sql", "peer"],
vec![60., 2. * 60., 3. * 60., 5. * 60., 10. * 60.]
)
.unwrap();
pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(

View File

@@ -1,940 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Run a flow as a recording rule: a time-window-aware normal query triggered on every tick set by the user
mod engine;
mod frontend_client;
use std::collections::BTreeSet;
use std::sync::Arc;
use api::helper::pb_value_to_value_ref;
use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_recordbatch::DfRecordBatch;
use common_telemetry::warn;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::Expr;
use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
use datafusion::prelude::SessionContext;
use datafusion::sql::unparser::Unparser;
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter};
use datafusion_common::{DFSchema, TableReference};
use datafusion_expr::{ColumnarValue, LogicalPlan};
use datafusion_physical_expr::PhysicalExprRef;
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::scalars::ScalarVector;
use datatypes::schema::TIME_INDEX_KEY;
use datatypes::value::Value;
use datatypes::vectors::{
TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
TimestampSecondVector, Vector,
};
pub use engine::RecordingRuleEngine;
pub use frontend_client::FrontendClient;
use itertools::Itertools;
use query::parser::QueryLanguageParser;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use crate::adapter::util::from_proto_to_data_type;
use crate::df_optimizer::apply_df_optimizer;
use crate::error::{ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, UnexpectedSnafu};
use crate::expr::error::DataTypeSnafu;
use crate::Error;
#[derive(Debug, Clone)]
pub struct TimeWindowExpr {
phy_expr: PhysicalExprRef,
column_name: String,
logical_expr: Expr,
df_schema: DFSchema,
}
impl TimeWindowExpr {
pub fn from_expr(expr: &Expr, column_name: &str, df_schema: &DFSchema) -> Result<Self, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
Ok(Self {
phy_expr,
column_name: column_name.to_string(),
logical_expr: expr.clone(),
df_schema: df_schema.clone(),
})
}
pub fn eval(
&self,
current: Timestamp,
) -> Result<(Option<Timestamp>, Option<Timestamp>), Error> {
let lower_bound =
find_expr_time_window_lower_bound(&self.logical_expr, &self.df_schema, current)?;
let upper_bound =
find_expr_time_window_upper_bound(&self.logical_expr, &self.df_schema, current)?;
Ok((lower_bound, upper_bound))
}
/// Find timestamps from rows using time window expr
pub async fn handle_rows(
&self,
rows_list: Vec<api::v1::Rows>,
) -> Result<BTreeSet<Timestamp>, Error> {
let mut time_windows = BTreeSet::new();
for rows in rows_list {
// pick the time index column and use it to eval on `self.expr`
let ts_col_index = rows
.schema
.iter()
.map(|col| col.column_name.clone())
.position(|name| name == self.column_name);
let Some(ts_col_index) = ts_col_index else {
warn!("can't found time index column in schema: {:?}", rows.schema);
continue;
};
let col_schema = &rows.schema[ts_col_index];
let cdt = from_proto_to_data_type(col_schema)?;
let column_values = rows
.rows
.iter()
.map(|row| &row.values[ts_col_index])
.collect_vec();
let mut vector = cdt.create_mutable_vector(column_values.len());
for value in column_values {
let value = pb_value_to_value_ref(value, &None);
vector.try_push_value_ref(value).context(DataTypeSnafu {
msg: "Failed to convert rows to columns",
})?;
}
let vector = vector.to_vector();
let df_schema = create_df_schema_for_ts_column(&self.column_name, cdt)?;
let rb =
DfRecordBatch::try_new(df_schema.inner().clone(), vec![vector.to_arrow_array()])
.with_context(|_e| ArrowSnafu {
context: format!(
"Failed to create record batch from {df_schema:?} and {vector:?}"
),
})?;
let eval_res = self
.phy_expr
.evaluate(&rb)
.with_context(|_| DatafusionSnafu {
context: format!(
"Failed to evaluate physical expression {:?} on {rb:?}",
self.phy_expr
),
})?;
let res = columnar_to_ts_vector(&eval_res)?;
for ts in res.into_iter().flatten() {
time_windows.insert(ts);
}
}
Ok(time_windows)
}
}
fn create_df_schema_for_ts_column(name: &str, cdt: ConcreteDataType) -> Result<DFSchema, Error> {
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
name,
cdt.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare("TimeIndexOnlyTable"))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
Ok(df_schema)
}
/// Convert `ColumnarValue` to `Vec<Option<Timestamp>>`
fn columnar_to_ts_vector(columnar: &ColumnarValue) -> Result<Vec<Option<Timestamp>>, Error> {
let val = match columnar {
datafusion_expr::ColumnarValue::Array(array) => {
let ty = array.data_type();
let ty = ConcreteDataType::from_arrow_type(ty);
let time_unit = if let ConcreteDataType::Timestamp(ty) = ty {
ty.unit()
} else {
return UnexpectedSnafu {
reason: format!("Non-timestamp type: {ty:?}"),
}
.fail();
};
match time_unit {
TimeUnit::Second => TimestampSecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec(),
TimeUnit::Millisecond => {
TimestampMillisecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
}
}
datafusion_expr::ColumnarValue::Scalar(scalar) => {
let value = Value::try_from(scalar.clone()).with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert scalar {scalar:?} to value"),
})?;
let ts = value.as_timestamp().context(UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", value),
})?;
vec![Some(ts)]
}
};
Ok(val)
}
/// Convert sql to datafusion logical plan
pub async fn sql_to_df_plan(
query_ctx: QueryContextRef,
engine: QueryEngineRef,
sql: &str,
optimize: bool,
) -> Result<LogicalPlan, Error> {
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = engine
.planner()
.plan(&stmt, query_ctx)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = if optimize {
apply_df_optimizer(plan).await?
} else {
plan
};
Ok(plan)
}
/// Return (the column name of time index column, the time window expr, the expected time unit of time index column, the expr's schema for evaluating the time window)
async fn find_time_window_expr(
plan: &LogicalPlan,
catalog_man: CatalogManagerRef,
query_ctx: QueryContextRef,
) -> Result<(String, Option<datafusion_expr::Expr>, TimeUnit, DFSchema), Error> {
// TODO(discord9): find the expr that do time window
let mut table_name = None;
// first find the table source in the logical plan
plan.apply(|plan| {
let LogicalPlan::TableScan(table_scan) = plan else {
return Ok(TreeNodeRecursion::Continue);
};
table_name = Some(table_scan.table_name.clone());
Ok(TreeNodeRecursion::Stop)
})
.with_context(|_| DatafusionSnafu {
context: format!("Can't find table source in plan {plan:?}"),
})?;
let Some(table_name) = table_name else {
UnexpectedSnafu {
reason: format!("Can't find table source in plan {plan:?}"),
}
.fail()?
};
let current_schema = query_ctx.current_schema();
let catalog_name = table_name.catalog().unwrap_or(query_ctx.current_catalog());
let schema_name = table_name.schema().unwrap_or(&current_schema);
let table_name = table_name.table();
let Some(table_ref) = catalog_man
.table(catalog_name, schema_name, table_name, Some(&query_ctx))
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
else {
UnexpectedSnafu {
reason: format!(
"Can't find table {table_name:?} in catalog {catalog_name:?}/{schema_name:?}"
),
}
.fail()?
};
let schema = &table_ref.table_info().meta.schema;
let ts_index = schema.timestamp_column().context(UnexpectedSnafu {
reason: format!("Can't find timestamp column in table {table_name:?}"),
})?;
let ts_col_name = ts_index.name.clone();
let expected_time_unit = ts_index.data_type.as_timestamp().with_context(|| UnexpectedSnafu {
reason: format!(
"Expected timestamp column {ts_col_name:?} in table {table_name:?} to be timestamp, but got {ts_index:?}"
),
})?.unit();
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
ts_col_name.clone(),
ts_index.data_type.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare(table_name))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
// find the time window expr which refers to the time index column
let mut aggr_expr = None;
let mut time_window_expr: Option<Expr> = None;
let find_inner_aggr_expr = |plan: &LogicalPlan| {
if let LogicalPlan::Aggregate(aggregate) = plan {
aggr_expr = Some(aggregate.clone());
};
Ok(TreeNodeRecursion::Continue)
};
plan.apply(find_inner_aggr_expr)
.with_context(|_| DatafusionSnafu {
context: format!("Can't find aggr expr in plan {plan:?}"),
})?;
if let Some(aggregate) = aggr_expr {
for group_expr in &aggregate.group_expr {
let refs = group_expr.column_refs();
if refs.len() != 1 {
continue;
}
let ref_col = refs.iter().next().unwrap();
let index = aggregate.input.schema().maybe_index_of_column(ref_col);
let Some(index) = index else {
continue;
};
let field = aggregate.input.schema().field(index);
let is_time_index = field.metadata().get(TIME_INDEX_KEY) == Some(&"true".to_string());
if is_time_index {
let rewrite_column = group_expr.clone();
let rewritten = rewrite_column
.rewrite(&mut RewriteColumn {
table_name: table_name.to_string(),
})
.with_context(|_| DatafusionSnafu {
context: format!("Rewrite expr failed, expr={:?}", group_expr),
})?
.data;
struct RewriteColumn {
table_name: String,
}
impl TreeNodeRewriter for RewriteColumn {
type Node = Expr;
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
let Expr::Column(mut column) = node else {
return Ok(Transformed::no(node));
};
column.relation = Some(TableReference::bare(self.table_name.clone()));
Ok(Transformed::yes(Expr::Column(column)))
}
}
time_window_expr = Some(rewritten);
break;
}
}
Ok((ts_col_name, time_window_expr, expected_time_unit, df_schema))
} else {
// can't find time window expr, return None
Ok((ts_col_name, None, expected_time_unit, df_schema))
}
}
/// Find nearest lower bound for time `current` in given `plan` for the time window expr.
/// i.e. for time window expr being `date_bin(INTERVAL '5 minutes', ts) as time_window` and `current="2021-07-01 00:01:01.000"`,
/// return `Some("2021-07-01 00:00:00.000")`
/// if `plan` doesn't contain a `TIME INDEX` column, return `None`
///
/// Time window expr is a expr that:
/// 1. ref only to a time index column
/// 2. is monotonic increasing
/// 3. show up in GROUP BY clause
///
/// note this plan should only contain one TableScan
pub async fn find_plan_time_window_bound(
plan: &LogicalPlan,
current: Timestamp,
query_ctx: QueryContextRef,
engine: QueryEngineRef,
) -> Result<(String, Option<Timestamp>, Option<Timestamp>), Error> {
// TODO(discord9): find the expr that do time window
let catalog_man = engine.engine_state().catalog_manager();
let (ts_col_name, time_window_expr, expected_time_unit, df_schema) =
find_time_window_expr(plan, catalog_man.clone(), query_ctx).await?;
// cast current to ts_index's type
let new_current = current
.convert_to(expected_time_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to cast current timestamp {current:?} to {expected_time_unit}"),
})?;
// if no time_window_expr is found, return None
if let Some(time_window_expr) = time_window_expr {
let lower_bound =
find_expr_time_window_lower_bound(&time_window_expr, &df_schema, new_current)?;
let upper_bound =
find_expr_time_window_upper_bound(&time_window_expr, &df_schema, new_current)?;
Ok((ts_col_name, lower_bound, upper_bound))
} else {
Ok((ts_col_name, None, None))
}
}
/// Find the lower bound of time window in given `expr` and `current` timestamp.
///
/// i.e. for `current="2021-07-01 00:01:01.000"` and `expr=date_bin(INTERVAL '5 minutes', ts) as time_window` and `ts_col=ts`,
/// return `Some("2021-07-01 00:00:00.000")` since it's the lower bound
/// of current time window given the current timestamp
///
/// if this returns None, the time window has no lower bound
fn find_expr_time_window_lower_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
let input_time_unit = cur_time_window.unit();
Ok(cur_time_window.convert_to(input_time_unit))
}
/// Find the upper bound for time window expression
fn find_expr_time_window_upper_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
use std::cmp::Ordering;
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
// search to find the lower bound
let mut offset: i64 = 1;
let mut lower_bound = Some(current);
let upper_bound;
// first do an exponential probe to find a range for binary search
loop {
let Some(next_val) = current.value().checked_add(offset) else {
// no upper bound if overflow
return Ok(None);
};
let next_time_probe = common_time::Timestamp::new(next_val, current.unit());
let next_time_window = eval_ts_to_ts(&phy_expr, df_schema, next_time_probe)?;
match next_time_window.cmp(&cur_time_window) {
Ordering::Less => {UnexpectedSnafu {
reason: format!(
"Unsupported time window expression, expect monotonic increasing for time window expression {expr:?}"
),
}
.fail()?
}
Ordering::Equal => {
lower_bound = Some(next_time_probe);
}
Ordering::Greater => {
upper_bound = Some(next_time_probe);
break
}
}
let Some(new_offset) = offset.checked_mul(2) else {
// no upper bound if overflow
return Ok(None);
};
offset = new_offset;
}
// binary search for the exact upper bound
ensure!(lower_bound.map(|v|v.unit())==upper_bound.map(|v|v.unit()), UnexpectedSnafu{
reason: format!(" unit mismatch for time window expression {expr:?}, found {lower_bound:?} and {upper_bound:?}"),
});
let output_unit = upper_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.unit();
let mut low = lower_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.value();
let mut high = upper_bound
.context(UnexpectedSnafu {
reason: "should have upper bound",
})?
.value();
while low < high {
let mid = (low + high) / 2;
let mid_probe = common_time::Timestamp::new(mid, output_unit);
let mid_time_window = eval_ts_to_ts(&phy_expr, df_schema, mid_probe)?;
match mid_time_window.cmp(&cur_time_window) {
Ordering::Less => UnexpectedSnafu {
reason: format!("Binary search failed for time window expression {expr:?}"),
}
.fail()?,
Ordering::Equal => low = mid + 1,
Ordering::Greater => high = mid,
}
}
let final_upper_bound_for_time_window = common_time::Timestamp::new(high, output_unit);
Ok(Some(final_upper_bound_for_time_window))
}
fn eval_ts_to_ts(
phy: &PhysicalExprRef,
df_schema: &DFSchema,
input_value: Timestamp,
) -> Result<Timestamp, Error> {
let schema_ty = df_schema.field(0).data_type();
let schema_cdt = ConcreteDataType::from_arrow_type(schema_ty);
let schema_unit = if let ConcreteDataType::Timestamp(ts) = schema_cdt {
ts.unit()
} else {
return UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", schema_cdt),
}
.fail();
};
let input_value = input_value
.convert_to(schema_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to convert timestamp {input_value:?} to {schema_unit}"),
})?;
let ts_vector = match schema_unit {
TimeUnit::Second => {
TimestampSecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Millisecond => {
TimestampMillisecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
};
let rb = DfRecordBatch::try_new(df_schema.inner().clone(), vec![ts_vector.clone()])
.with_context(|_| ArrowSnafu {
context: format!("Failed to create record batch from {df_schema:?} and {ts_vector:?}"),
})?;
let eval_res = phy.evaluate(&rb).with_context(|_| DatafusionSnafu {
context: format!("Failed to evaluate physical expression {phy:?} on {rb:?}"),
})?;
if let Some(Some(ts)) = columnar_to_ts_vector(&eval_res)?.first() {
Ok(*ts)
} else {
UnexpectedSnafu {
reason: format!(
"Expected timestamp in expression {phy:?} but got {:?}",
eval_res
),
}
.fail()?
}
}
// TODO(discord9): a method to find out the precise time window
/// Find out the `Filter` Node corresponding to outermost `WHERE` and add a new filter expr to it
#[derive(Debug)]
pub struct AddFilterRewriter {
extra_filter: Expr,
is_rewritten: bool,
}
impl AddFilterRewriter {
fn new(filter: Expr) -> Self {
Self {
extra_filter: filter,
is_rewritten: false,
}
}
}
impl TreeNodeRewriter for AddFilterRewriter {
type Node = LogicalPlan;
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if self.is_rewritten {
return Ok(Transformed::no(node));
}
match node {
LogicalPlan::Filter(mut filter) if !filter.having => {
filter.predicate = filter.predicate.and(self.extra_filter.clone());
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
LogicalPlan::TableScan(_) => {
// add a new filter
let filter =
datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
_ => Ok(Transformed::no(node)),
}
}
}
fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
/// A dialect that forces all identifiers to be quoted
struct ForceQuoteIdentifiers;
impl datafusion::sql::unparser::dialect::Dialect for ForceQuoteIdentifiers {
fn identifier_quote_style(&self, identifier: &str) -> Option<char> {
if identifier.to_lowercase() != identifier {
Some('"')
} else {
None
}
}
}
let unparser = Unparser::new(&ForceQuoteIdentifiers);
// first make all column qualified
let sql = unparser
.plan_to_sql(plan)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to unparse logical plan {plan:?}"),
})?;
Ok(sql.to_string())
}
#[cfg(test)]
mod test {
use datafusion_common::tree_node::TreeNode;
use pretty_assertions::assert_eq;
use session::context::QueryContext;
use super::{sql_to_df_plan, *};
use crate::recording_rules::{df_plan_to_sql, AddFilterRewriter};
use crate::test_utils::create_test_query_engine;
#[tokio::test]
async fn test_sql_plan_convert() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let old = r#"SELECT "NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#;
let new = sql_to_df_plan(ctx.clone(), query_engine.clone(), old, false)
.await
.unwrap();
let new_sql = df_plan_to_sql(&new).unwrap();
assert_eq!(
r#"SELECT "UPPERCASE_NUMBERS_WITH_TS"."NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#,
new_sql
);
}
#[tokio::test]
async fn test_add_filter() {
let testcases = vec![
(
"SELECT number FROM numbers_with_ts GROUP BY number","SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
),
(
"SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
"SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
),
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
)
];
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
for (before, after) in testcases {
let sql = before;
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
let plan = plan.rewrite(&mut add_filter).unwrap().data;
let new_sql = df_plan_to_sql(&plan).unwrap();
assert_eq!(after, new_sql);
}
}
#[tokio::test]
async fn test_plan_time_window_lower_bound() {
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let testcases = [
// same alias is not same column
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394109000, TimeUnit::Millisecond)),
Some(Timestamp::new(1740394109001, TimeUnit::Millisecond)),
),
r#"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:29' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:48:29.001' AS TIMESTAMP))) GROUP BY numbers_with_ts.ts"#
),
// complex time window index
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394080, TimeUnit::Second)),
Some(Timestamp::new(1740394140, TimeUnit::Second)),
),
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:00' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:49:00' AS TIMESTAMP))) GROUP BY arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)')"
),
// no time index
(
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;",
Timestamp::new(23, TimeUnit::Millisecond),
("ts".to_string(), None, None),
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;"
),
// time index
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// on spot
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(0, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// different time unit
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23_000_000, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other fields
(
"SELECT sum(number) as sum_up, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number) AS sum_up, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other pks
(
"SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number"
),
// subquery
(
"SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
),
// cte
(
"with cte as (select number, date_bin('5 minutes', ts) as time_window from numbers_with_ts GROUP BY time_window, number) select number, time_window from cte;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT cte.number, cte.time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number) AS cte"
),
// complex subquery without alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
),
// complex subquery alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) AS cte GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
),
];
for (sql, current, expected, expected_unparsed) in testcases {
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
.await
.unwrap();
let real =
find_plan_time_window_bound(&plan, current, ctx.clone(), query_engine.clone())
.await
.unwrap();
assert_eq!(expected, real);
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let (col_name, lower, upper) = real;
let new_sql = if lower.is_some() {
let to_df_literal = |value| {
let value = Value::from(value);
value.try_to_scalar_value(&value.data_type()).unwrap()
};
let lower = to_df_literal(lower.unwrap());
let upper = to_df_literal(upper.unwrap());
let expr = col(&col_name)
.gt_eq(lit(lower))
.and(col(&col_name).lt_eq(lit(upper)));
let mut add_filter = AddFilterRewriter::new(expr);
let plan = plan.rewrite(&mut add_filter).unwrap().data;
df_plan_to_sql(&plan).unwrap()
} else {
sql.to_string()
};
assert_eq!(expected_unparsed, new_sql);
}
}
}
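The expected bounds in the testcases above come from `date_bin('5 minutes', ts)` flooring the timestamp to the start of its 5-minute bucket; a minimal std-only sketch of that arithmetic (the helper below is illustrative, not part of the crate):

fn window_bounds_ms(ts_ms: i64, window_ms: i64) -> (i64, i64) {
    // Floor to the bucket start; the matching upper bound is one window later.
    let lower = ts_ms.div_euclid(window_ms) * window_ms;
    (lower, lower + window_ms)
}

fn main() {
    // 23 ms (or 23 ns / 23_000_000 ns, once normalized) falls into the first
    // 5-minute bucket, so the bound is (0, 300000) ms, matching the expected tuples above.
    assert_eq!(window_bounds_ms(23, 5 * 60 * 1000), (0, 300_000));
}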

View File

@@ -1,815 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use api::v1::flow::FlowResponse;
use common_error::ext::BoxedError;
use common_meta::ddl::create_flow::FlowType;
use common_meta::key::flow::FlowMetadataManagerRef;
use common_meta::key::table_info::TableInfoManager;
use common_meta::key::TableMetadataManagerRef;
use common_telemetry::tracing::warn;
use common_telemetry::{debug, info};
use common_time::Timestamp;
use datafusion::sql::unparser::expr_to_sql;
use datafusion_common::tree_node::TreeNode;
use datatypes::value::Value;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use table::metadata::TableId;
use tokio::sync::oneshot::error::TryRecvError;
use tokio::sync::{oneshot, RwLock};
use tokio::time::Instant;
use super::frontend_client::FrontendClient;
use super::{df_plan_to_sql, AddFilterRewriter, TimeWindowExpr};
use crate::adapter::{CreateFlowArgs, FlowId, TableName};
use crate::error::{
DatafusionSnafu, DatatypesSnafu, ExternalSnafu, FlowAlreadyExistSnafu, InternalSnafu,
TimeSnafu, UnexpectedSnafu,
};
use crate::metrics::{METRIC_FLOW_RULE_ENGINE_QUERY_TIME, METRIC_FLOW_RULE_ENGINE_SLOW_QUERY};
use crate::recording_rules::{find_time_window_expr, sql_to_df_plan};
use crate::Error;
/// TODO(discord9): make those constants configurable
/// The default rule engine query timeout is 10 minutes
pub const DEFAULT_RULE_ENGINE_QUERY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
/// will output a warn log for any query that runs for more than 1 minute, and also every minute while that query is still running
pub const SLOW_QUERY_THRESHOLD: Duration = Duration::from_secs(60);
/// TODO(discord9): determine how to configure refresh rate
pub struct RecordingRuleEngine {
tasks: RwLock<BTreeMap<FlowId, RecordingRuleTask>>,
shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
frontend_client: Arc<FrontendClient>,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
engine: QueryEngineRef,
}
impl RecordingRuleEngine {
pub fn new(
frontend_client: Arc<FrontendClient>,
engine: QueryEngineRef,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
) -> Self {
Self {
tasks: Default::default(),
shutdown_txs: Default::default(),
frontend_client,
flow_metadata_manager,
table_meta,
engine,
}
}
pub async fn handle_inserts(
&self,
request: api::v1::region::InsertRequests,
) -> Result<FlowResponse, Error> {
let table_info_mgr = self.table_meta.table_info_manager();
let mut group_by_table_name: HashMap<TableName, Vec<api::v1::Rows>> = HashMap::new();
for r in request.requests {
let tid = RegionId::from(r.region_id).table_id();
let name = get_table_name(table_info_mgr, &tid).await?;
let entry = group_by_table_name.entry(name).or_default();
if let Some(rows) = r.rows {
entry.push(rows);
}
}
for (_flow_id, task) in self.tasks.read().await.iter() {
let src_table_names = &task.source_table_names;
for src_table_name in src_table_names {
if let Some(entry) = group_by_table_name.get(src_table_name) {
let Some(expr) = &task.time_window_expr else {
continue;
};
let involved_time_windows = expr.handle_rows(entry.clone()).await?;
let mut state = task.state.write().await;
state
.dirty_time_windows
.add_lower_bounds(involved_time_windows.into_iter());
}
}
}
Ok(Default::default())
}
}
async fn get_table_name(zelf: &TableInfoManager, table_id: &TableId) -> Result<TableName, Error> {
zelf.get(*table_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", table_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])
}
const MIN_REFRESH_DURATION: Duration = Duration::new(5, 0);
impl RecordingRuleEngine {
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
let CreateFlowArgs {
flow_id,
sink_table_name,
source_table_ids,
create_if_not_exists,
or_replace,
expire_after,
comment: _,
sql,
flow_options,
query_ctx,
} = args;
// or replace logic
{
let is_exist = self.tasks.read().await.contains_key(&flow_id);
match (create_if_not_exists, or_replace, is_exist) {
// if replace, ignore that old flow exists
(_, true, true) => {
info!("Replacing flow with id={}", flow_id);
}
(false, false, true) => FlowAlreadyExistSnafu { id: flow_id }.fail()?,
// already exists, and not replace, return None
(true, false, true) => {
info!("Flow with id={} already exists, do nothing", flow_id);
return Ok(None);
}
// continue as normal
(_, _, false) => (),
}
}
let flow_type = flow_options.get(FlowType::FLOW_TYPE_KEY);
ensure!(
flow_type == Some(&FlowType::RecordingRule.to_string()) || flow_type.is_none(),
UnexpectedSnafu {
reason: format!("Flow type is not RecordingRule nor None, got {flow_type:?}")
}
);
let Some(query_ctx) = query_ctx else {
UnexpectedSnafu {
reason: "Query context is None".to_string(),
}
.fail()?
};
let query_ctx = Arc::new(query_ctx);
let mut source_table_names = Vec::new();
for src_id in source_table_ids {
let table_name = self
.table_meta
.table_info_manager()
.get(src_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", src_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])?;
source_table_names.push(table_name);
}
let (tx, rx) = oneshot::channel();
let plan = sql_to_df_plan(query_ctx.clone(), self.engine.clone(), &sql, true).await?;
let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
&plan,
self.engine.engine_state().catalog_manager().clone(),
query_ctx.clone(),
)
.await?;
let phy_expr = time_window_expr
.map(|expr| TimeWindowExpr::from_expr(&expr, &column_name, &df_schema))
.transpose()?;
info!("Flow id={}, found time window expr={:?}", flow_id, phy_expr);
let task = RecordingRuleTask::new(
flow_id,
&sql,
phy_expr,
expire_after,
sink_table_name,
source_table_names,
query_ctx,
rx,
);
let task_inner = task.clone();
let engine = self.engine.clone();
let frontend = self.frontend_client.clone();
// TODO(discord9): also save the handle & use a time wheel or similar for better scheduling
let _handle = common_runtime::spawn_global(async move {
match task_inner.start_executing(engine, frontend).await {
Ok(()) => info!("Flow {} shutdown", task_inner.flow_id),
Err(err) => common_telemetry::error!(
"Flow {} encounter unrecoverable error: {err:?}",
task_inner.flow_id
),
}
});
// TODO(discord9): deal with replace logic
let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
drop(replaced_old_task_opt);
self.shutdown_txs.write().await.insert(flow_id, tx);
Ok(Some(flow_id))
}
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
if self.tasks.write().await.remove(&flow_id).is_none() {
warn!("Flow {flow_id} not found in tasks")
}
let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
UnexpectedSnafu {
reason: format!("Can't found shutdown tx for flow {flow_id}"),
}
.fail()?
};
if tx.send(()).is_err() {
warn!("Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?")
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct RecordingRuleTask {
pub flow_id: FlowId,
query: String,
pub time_window_expr: Option<TimeWindowExpr>,
/// in seconds
pub expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: HashSet<[String; 3]>,
state: Arc<RwLock<RecordingRuleState>>,
}
impl RecordingRuleTask {
#[allow(clippy::too_many_arguments)]
pub fn new(
flow_id: FlowId,
query: &str,
time_window_expr: Option<TimeWindowExpr>,
expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: Vec<[String; 3]>,
query_ctx: QueryContextRef,
shutdown_rx: oneshot::Receiver<()>,
) -> Self {
Self {
flow_id,
query: query.to_string(),
time_window_expr,
expire_after,
sink_table_name,
source_table_names: source_table_names.into_iter().collect(),
state: Arc::new(RwLock::new(RecordingRuleState::new(query_ctx, shutdown_rx))),
}
}
}
impl RecordingRuleTask {
/// This should be called in a new tokio task
pub async fn start_executing(
&self,
engine: QueryEngineRef,
frontend_client: Arc<FrontendClient>,
) -> Result<(), Error> {
// only the first query doesn't need an upper bound
let mut is_first = true;
loop {
// FIXME(discord9): test if need upper bound also works
let new_query = self.gen_query_with_time_window(engine.clone()).await?;
let insert_into = if let Some(new_query) = new_query {
format!(
"INSERT INTO {}.{}.{} {}",
self.sink_table_name[0],
self.sink_table_name[1],
self.sink_table_name[2],
new_query
)
} else {
tokio::time::sleep(MIN_REFRESH_DURATION).await;
continue;
};
if is_first {
is_first = false;
}
let instant = Instant::now();
let flow_id = self.flow_id;
let db_client = frontend_client.get_database_client().await?;
let peer_addr = db_client.peer.addr;
debug!(
"Executing flow {flow_id}(expire_after={:?} secs) on {:?} with query {}",
self.expire_after, peer_addr, &insert_into
);
let timer = METRIC_FLOW_RULE_ENGINE_QUERY_TIME
.with_label_values(&[flow_id.to_string().as_str()])
.start_timer();
let res = db_client.database.sql(&insert_into).await;
drop(timer);
let elapsed = instant.elapsed();
if let Ok(res1) = &res {
debug!(
"Flow {flow_id} executed, result: {res1:?}, elapsed: {:?}",
elapsed
);
} else if let Err(res) = &res {
warn!(
"Failed to execute Flow {flow_id} on frontend {}, result: {res:?}, elapsed: {:?} with query: {}",
peer_addr, elapsed, &insert_into
);
}
// record slow query
if elapsed >= SLOW_QUERY_THRESHOLD {
warn!(
"Flow {flow_id} on frontend {} executed for {:?} before complete, query: {}",
peer_addr, elapsed, &insert_into
);
METRIC_FLOW_RULE_ENGINE_SLOW_QUERY
.with_label_values(&[flow_id.to_string().as_str(), &insert_into, &peer_addr])
.observe(elapsed.as_secs_f64());
}
self.state
.write()
.await
.after_query_exec(elapsed, res.is_ok());
// drop the result to free client-related resources
drop(res);
let sleep_until = {
let mut state = self.state.write().await;
match state.shutdown_rx.try_recv() {
Ok(()) => break Ok(()),
Err(TryRecvError::Closed) => {
warn!("Unexpected shutdown flow {flow_id}, shutdown anyway");
break Ok(());
}
Err(TryRecvError::Empty) => (),
}
state.get_next_start_query_time(None)
};
tokio::time::sleep_until(sleep_until).await;
}
}
/// Merge dirty time windows and use at most the first `DirtyTimeWindows::MAX_FILTER_NUM` of them in the generated query
async fn gen_query_with_time_window(
&self,
engine: QueryEngineRef,
) -> Result<Option<String>, Error> {
let query_ctx = self.state.read().await.query_ctx.clone();
let start = SystemTime::now();
let since_the_epoch = start
.duration_since(UNIX_EPOCH)
.expect("Time went backwards");
let low_bound = self
.expire_after
.map(|e| since_the_epoch.as_secs() - e as u64)
.unwrap_or(u64::MIN);
let low_bound = Timestamp::new_second(low_bound as i64);
// TODO(discord9): use time window expr to get the precise expire lower bound
let expire_time_window_bound = self
.time_window_expr
.as_ref()
.map(|expr| expr.eval(low_bound))
.transpose()?;
let new_sql = {
let expr = {
match expire_time_window_bound {
Some((Some(l), Some(u))) => {
let window_size = u.sub(&l).with_context(|| UnexpectedSnafu {
reason: format!("Can't get window size from {u:?} - {l:?}"),
})?;
let col_name = self
.time_window_expr
.as_ref()
.map(|expr| expr.column_name.clone())
.with_context(|| UnexpectedSnafu {
reason: format!(
"Flow id={:?}, Failed to get column name from time window expr",
self.flow_id
),
})?;
self.state
.write()
.await
.dirty_time_windows
.gen_filter_exprs(&col_name, Some(l), window_size, self)?
}
_ => {
debug!(
"Flow id = {:?}, can't get window size: precise_lower_bound={expire_time_window_bound:?}, using the same query", self.flow_id
);
// since no time window lower/upper bound is found, just return the original query
return Ok(Some(self.query.clone()));
}
}
};
debug!(
"Flow id={:?}, Generated filter expr: {:?}",
self.flow_id,
expr.as_ref()
.map(|expr| expr_to_sql(expr).with_context(|_| DatafusionSnafu {
context: format!("Failed to generate filter expr from {expr:?}"),
}))
.transpose()?
.map(|s| s.to_string())
);
let Some(expr) = expr else {
// no new data, hence no need to update
debug!("Flow id={:?}, no new data, not update", self.flow_id);
return Ok(None);
};
let mut add_filter = AddFilterRewriter::new(expr);
// make an unoptimized plan for clearer unparsing
let plan =
sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.query, false).await?;
let plan = plan
.clone()
.rewrite(&mut add_filter)
.with_context(|_| DatafusionSnafu {
context: format!("Failed to rewrite plan {plan:?}"),
})?
.data;
df_plan_to_sql(&plan)?
};
Ok(Some(new_sql))
}
}
#[derive(Debug)]
pub struct RecordingRuleState {
query_ctx: QueryContextRef,
/// last query complete time
last_update_time: Instant,
/// last time query duration
last_query_duration: Duration,
/// Dirty time windows that need to be updated,
/// a non-overlapping mapping of `start -> end`
dirty_time_windows: DirtyTimeWindows,
exec_state: ExecState,
shutdown_rx: oneshot::Receiver<()>,
}
#[derive(Debug, Clone, Default)]
pub struct DirtyTimeWindows {
windows: BTreeMap<Timestamp, Option<Timestamp>>,
}
fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
let value = Value::from(value);
let value = value
.try_to_scalar_value(&value.data_type())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert to scalar value: {}", value),
})?;
Ok(value)
}
impl DirtyTimeWindows {
/// Time window merge distance
const MERGE_DIST: i32 = 3;
/// Maximum number of filters allowed in a single query
const MAX_FILTER_NUM: usize = 20;
/// Add lower bounds to the dirty time windows. Upper bounds are ignored.
///
/// # Arguments
///
/// * `lower_bounds` - An iterator of lower bounds to be added.
pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
for lower_bound in lower_bounds {
let entry = self.windows.entry(lower_bound);
entry.or_insert(None);
}
}
/// Generate filter expressions, consuming at most `MAX_FILTER_NUM` of the dirty time windows
pub fn gen_filter_exprs(
&mut self,
col_name: &str,
expire_lower_bound: Option<Timestamp>,
window_size: chrono::Duration,
task_ctx: &RecordingRuleTask,
) -> Result<Option<datafusion_expr::Expr>, Error> {
debug!(
"expire_lower_bound: {:?}, window_size: {:?}",
expire_lower_bound.map(|t| t.to_iso8601_string()),
window_size
);
self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
if self.windows.len() > Self::MAX_FILTER_NUM {
let first_time_window = self.windows.first_key_value();
let last_time_window = self.windows.last_key_value();
warn!(
"Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
task_ctx.flow_id,
self.windows.len(),
Self::MAX_FILTER_NUM,
task_ctx.time_window_expr,
task_ctx.expire_after,
first_time_window,
last_time_window,
task_ctx.query
);
}
// get the first `MAX_FILTER_NUM` time windows
let nth = self
.windows
.iter()
.nth(Self::MAX_FILTER_NUM)
.map(|(key, _)| *key);
let first_nth = {
if let Some(nth) = nth {
let mut after = self.windows.split_off(&nth);
std::mem::swap(&mut self.windows, &mut after);
after
} else {
std::mem::take(&mut self.windows)
}
};
let mut expr_lst = vec![];
for (start, end) in first_nth.into_iter() {
debug!(
"Time window start: {:?}, end: {:?}",
start.to_iso8601_string(),
end.map(|t| t.to_iso8601_string())
);
use datafusion_expr::{col, lit};
let lower = to_df_literal(start)?;
let upper = end.map(to_df_literal).transpose()?;
let expr = if let Some(upper) = upper {
col(col_name)
.gt_eq(lit(lower))
.and(col(col_name).lt(lit(upper)))
} else {
col(col_name).gt_eq(lit(lower))
};
expr_lst.push(expr);
}
let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
Ok(expr)
}
/// Merge time windows that overlap or are too close
pub fn merge_dirty_time_windows(
&mut self,
window_size: chrono::Duration,
expire_lower_bound: Option<Timestamp>,
) -> Result<(), Error> {
let mut new_windows = BTreeMap::new();
let mut prev_tw = None;
for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
// filter out expired time window
if let Some(expire_lower_bound) = expire_lower_bound {
if lower_bound <= expire_lower_bound {
continue;
}
}
let Some(prev_tw) = &mut prev_tw else {
prev_tw = Some((lower_bound, upper_bound));
continue;
};
let std_window_size = window_size.to_std().map_err(|e| {
InternalSnafu {
reason: e.to_string(),
}
.build()
})?;
// if cur.lower - prev.upper <= window_size * MERGE_DIST, merge
let prev_upper = prev_tw
.1
.unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
prev_tw.1 = Some(prev_upper);
let cur_upper = upper_bound.unwrap_or(
lower_bound
.add_duration(std_window_size)
.context(TimeSnafu)?,
);
if lower_bound
.sub(&prev_upper)
.map(|dist| dist <= window_size * Self::MERGE_DIST)
.unwrap_or(false)
{
prev_tw.1 = Some(cur_upper);
} else {
new_windows.insert(prev_tw.0, prev_tw.1);
*prev_tw = (lower_bound, Some(cur_upper));
}
}
if let Some(prev_tw) = prev_tw {
new_windows.insert(prev_tw.0, prev_tw.1);
}
self.windows = new_windows;
Ok(())
}
}
impl RecordingRuleState {
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
Self {
query_ctx,
last_update_time: Instant::now(),
last_query_duration: Duration::from_secs(0),
dirty_time_windows: Default::default(),
exec_state: ExecState::Idle,
shutdown_rx,
}
}
/// Called after the last query is done.
/// `is_succ` indicates whether the last query was successful
pub fn after_query_exec(&mut self, elapsed: Duration, _is_succ: bool) {
self.exec_state = ExecState::Idle;
self.last_query_duration = elapsed;
self.last_update_time = Instant::now();
}
/// Wait for `last_query_duration` (capped by `max_timeout`), but at least `MIN_REFRESH_DURATION`, before starting the next query
pub fn get_next_start_query_time(&self, max_timeout: Option<Duration>) -> Instant {
let next_duration = max_timeout
.unwrap_or(self.last_query_duration)
.min(self.last_query_duration);
let next_duration = next_duration.max(MIN_REFRESH_DURATION);
self.last_update_time + next_duration
}
}
#[derive(Debug, Clone)]
enum ExecState {
Idle,
Executing,
}
#[cfg(test)]
mod test {
use pretty_assertions::assert_eq;
use super::*;
#[test]
fn test_merge_dirty_time_windows() {
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// just enough to merge
assert_eq!(
dirty.windows,
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)])
);
// separate time window
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// too far apart to merge, so the windows stay separate
assert_eq!(
BTreeMap::from([
(
Timestamp::new_second(0),
Some(Timestamp::new_second(5 * 60))
),
(
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
Some(Timestamp::new_second(
(3 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)
]),
dirty.windows
);
// overlapping
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// overlapping windows are merged into one
assert_eq!(
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
),]),
dirty.windows
);
// expired
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(
chrono::Duration::seconds(5 * 60),
Some(Timestamp::new_second(
(DirtyTimeWindows::MERGE_DIST as i64) * 6 * 60,
)),
)
.unwrap();
// all windows are at or before the expire lower bound, so they are dropped
assert_eq!(BTreeMap::from([]), dirty.windows);
}
}
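A minimal std-only sketch of the merge rule in `merge_dirty_time_windows` above, over plain `i64` seconds instead of `Timestamp` (the function name is illustrative): a window with no upper bound defaults to `lower + window_size`, expired windows are dropped, and a window is merged into the previous one when the gap to the previous upper bound is within `MERGE_DIST` windows.

use std::collections::BTreeMap;

const MERGE_DIST: i64 = 3;

fn merge_windows(
    windows: BTreeMap<i64, Option<i64>>,
    window_size: i64,
    expire_lower_bound: Option<i64>,
) -> BTreeMap<i64, Option<i64>> {
    let mut merged = BTreeMap::new();
    let mut prev: Option<(i64, i64)> = None;
    for (lower, upper) in windows {
        // Expired windows are dropped entirely.
        if matches!(expire_lower_bound, Some(e) if lower <= e) {
            continue;
        }
        let upper = upper.unwrap_or(lower + window_size);
        match prev {
            None => prev = Some((lower, upper)),
            Some((p_lower, p_upper)) => {
                if lower - p_upper <= window_size * MERGE_DIST {
                    // Close enough: extend the previous window to cover this one.
                    prev = Some((p_lower, upper));
                } else {
                    merged.insert(p_lower, Some(p_upper));
                    prev = Some((lower, upper));
                }
            }
        }
    }
    if let Some((l, u)) = prev {
        merged.insert(l, Some(u));
    }
    merged
}

With `window_size = 300` and lower bounds `{0, 1200}`, the gap from the first upper bound (300) to 1200 is exactly `3 * 300`, so the two windows merge into one covering `0..1500`, matching the first case in `test_merge_dirty_time_windows` above.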

View File

@@ -1,163 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Frontend client used to run a flow as a recording rule, i.e. a time-window-aware normal query triggered on every tick configured by the user
use std::sync::Arc;
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
use common_meta::peer::Peer;
use common_meta::rpc::store::RangeRequest;
use meta_client::client::MetaClient;
use snafu::ResultExt;
use crate::error::{ExternalSnafu, UnexpectedSnafu};
use crate::recording_rules::engine::DEFAULT_RULE_ENGINE_QUERY_TIMEOUT;
use crate::Error;
fn default_channel_mgr() -> ChannelManager {
let cfg = ChannelConfig::new().timeout(DEFAULT_RULE_ENGINE_QUERY_TIMEOUT);
ChannelManager::with_config(cfg)
}
fn client_from_urls(addrs: Vec<String>) -> Client {
Client::with_manager_and_urls(default_channel_mgr(), addrs)
}
/// A simple frontend client able to execute sql using grpc protocol
#[derive(Debug)]
pub enum FrontendClient {
Distributed {
meta_client: Arc<MetaClient>,
channel_mgr: ChannelManager,
},
Standalone {
/// for the sake of simplicity, grpc is still used even in standalone mode
/// note the client here should be lazy, so that it can wait until the frontend is booted before making a connection
/// TODO(discord9): not use grpc under standalone mode
database_client: DatabaseWithPeer,
},
}
#[derive(Debug, Clone)]
pub struct DatabaseWithPeer {
pub database: Database,
pub peer: Peer,
}
impl DatabaseWithPeer {
fn new(database: Database, peer: Peer) -> Self {
Self { database, peer }
}
}
impl FrontendClient {
pub fn from_meta_client(meta_client: Arc<MetaClient>) -> Self {
Self::Distributed {
meta_client,
channel_mgr: default_channel_mgr(),
}
}
pub fn from_static_grpc_addr(addr: String) -> Self {
let peer = Peer {
id: 0,
addr: addr.clone(),
};
let mgr = default_channel_mgr();
let client = Client::with_manager_and_urls(mgr.clone(), vec![addr]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Self::Standalone {
database_client: DatabaseWithPeer::new(database, peer),
}
}
}
impl FrontendClient {
async fn scan_for_frontend(&self) -> Result<Vec<(NodeInfoKey, NodeInfo)>, Error> {
let Self::Distributed { meta_client, .. } = self else {
return Ok(vec![]);
};
let cluster_client = meta_client
.cluster_client()
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend);
let req = RangeRequest::new().with_prefix(prefix);
let resp = cluster_client
.range(req)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let mut res = Vec::with_capacity(resp.kvs.len());
for kv in resp.kvs {
let key = NodeInfoKey::try_from(kv.key)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let val = NodeInfo::try_from(kv.value)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
res.push((key, val));
}
Ok(res)
}
/// Get the database with max `last_activity_ts`
async fn get_last_active_frontend(&self) -> Result<DatabaseWithPeer, Error> {
if let Self::Standalone { database_client } = self {
return Ok(database_client.clone());
}
match &self {
Self::Standalone { database_client } => Ok(database_client.clone()),
Self::Distributed {
meta_client: _,
channel_mgr,
} => {
let frontends = self.scan_for_frontend().await?;
let mut last_activity_ts = i64::MIN;
let mut peer = None;
for (_key, val) in frontends.iter() {
if val.last_activity_ts > last_activity_ts {
last_activity_ts = val.last_activity_ts;
peer = Some(val.peer.clone());
}
}
let Some(peer) = peer else {
UnexpectedSnafu {
reason: format!("No frontend available: {:?}", frontends),
}
.fail()?
};
let client =
Client::with_manager_and_urls(channel_mgr.clone(), vec![peer.addr.clone()]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Ok(DatabaseWithPeer::new(database, peer))
}
}
}
/// Get a database client, and possibly update it before returning.
pub async fn get_database_client(&self) -> Result<DatabaseWithPeer, Error> {
match self {
Self::Standalone { database_client } => Ok(database_client.clone()),
Self::Distributed { meta_client: _, .. } => self.get_last_active_frontend().await,
}
}
}
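A minimal sketch of the selection logic in `get_last_active_frontend` above: among the frontends discovered from the metasrv, the peer with the largest `last_activity_ts` is picked for the grpc connection (the struct below is a simplified stand-in for `NodeInfo`, and the addresses are illustrative).

struct FrontendNode {
    addr: String,
    last_activity_ts: i64,
}

fn pick_last_active(frontends: &[FrontendNode]) -> Option<&FrontendNode> {
    // The most recently active frontend wins; `None` if no frontend is available.
    frontends.iter().max_by_key(|n| n.last_activity_ts)
}

fn main() {
    let nodes = vec![
        FrontendNode { addr: "10.0.0.1:4001".to_string(), last_activity_ts: 100 },
        FrontendNode { addr: "10.0.0.2:4001".to_string(), last_activity_ts: 250 },
    ];
    assert_eq!(pick_last_active(&nodes).unwrap().addr, "10.0.0.2:4001");
}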

View File

@@ -57,7 +57,6 @@ use crate::error::{
};
use crate::heartbeat::HeartbeatTask;
use crate::metrics::{METRIC_FLOW_PROCESSING_TIME, METRIC_FLOW_ROWS};
use crate::recording_rules::{FrontendClient, RecordingRuleEngine};
use crate::transform::register_function_to_query_engine;
use crate::utils::{SizeReportSender, StateReportHandler};
use crate::{Error, FlowWorkerManager, FlownodeOptions};
@@ -246,7 +245,6 @@ impl FlownodeInstance {
self.server.shutdown().await.context(ShutdownServerSnafu)?;
if let Some(task) = &self.heartbeat_task {
info!("Close heartbeat task for flownode");
task.shutdown();
}
@@ -273,8 +271,6 @@ pub struct FlownodeBuilder {
heartbeat_task: Option<HeartbeatTask>,
/// receive a oneshot sender to send state size report
state_report_handler: Option<StateReportHandler>,
/// Client to send sql to frontend
frontend_client: Arc<FrontendClient>,
}
impl FlownodeBuilder {
@@ -285,7 +281,6 @@ impl FlownodeBuilder {
table_meta: TableMetadataManagerRef,
catalog_manager: CatalogManagerRef,
flow_metadata_manager: FlowMetadataManagerRef,
frontend_client: Arc<FrontendClient>,
) -> Self {
Self {
opts,
@@ -295,7 +290,6 @@ impl FlownodeBuilder {
flow_metadata_manager,
heartbeat_task: None,
state_report_handler: None,
frontend_client,
}
}
@@ -453,14 +447,7 @@ impl FlownodeBuilder {
let node_id = self.opts.node_id.map(|id| id as u32);
let rule_engine = RecordingRuleEngine::new(
self.frontend_client.clone(),
query_engine.clone(),
self.flow_metadata_manager.clone(),
table_meta.clone(),
);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta, rule_engine);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta);
for worker_id in 0..num_workers {
let (tx, rx) = oneshot::channel();

View File

@@ -86,8 +86,7 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
let schema = vec![
datatypes::schema::ColumnSchema::new("number", CDT::uint32_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
.with_time_index(true),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false),
];
let mut columns = vec![];
let numbers = (1..=10).collect_vec();
@@ -115,37 +114,6 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
};
catalog_list.register_table_sync(req_with_ts).unwrap();
let schema = vec![
datatypes::schema::ColumnSchema::new("NUMBER", CDT::uint32_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
.with_time_index(true),
];
let mut columns = vec![];
let numbers = (1..=10).collect_vec();
let column: VectorRef = Arc::new(<u32 as Scalar>::VectorType::from_vec(numbers));
columns.push(column);
let ts = (1..=10).collect_vec();
let mut builder = TimestampMillisecondVectorBuilder::with_capacity(10);
ts.into_iter()
.map(|v| builder.push(Some(TimestampMillisecond::new(v))))
.count();
let column: VectorRef = builder.to_vector_cloned();
columns.push(column);
let schema = Arc::new(Schema::new(schema));
let recordbatch = common_recordbatch::RecordBatch::new(schema, columns).unwrap();
let table = MemTable::table("UPPERCASE_NUMBERS_WITH_TS", recordbatch);
let req_with_ts = RegisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table_name: "UPPERCASE_NUMBERS_WITH_TS".to_string(),
table_id: 1025,
table,
};
catalog_list.register_table_sync(req_with_ts).unwrap();
let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);
let engine = factory.query_engine();

View File

@@ -238,13 +238,6 @@ pub enum Error {
source: servers::error::Error,
},
#[snafu(display("Failed to create logical plan for prometheus label values query"))]
PrometheusLabelValuesQueryPlan {
#[snafu(implicit)]
location: Location,
source: query::promql::error::Error,
},
#[snafu(display("Failed to describe schema for given statement"))]
DescribeStatement {
#[snafu(implicit)]
@@ -373,8 +366,6 @@ impl ErrorExt for Error {
| Error::PrometheusMetricNamesQueryPlan { source, .. }
| Error::ExecutePromql { source, .. } => source.status_code(),
Error::PrometheusLabelValuesQueryPlan { source, .. } => source.status_code(),
Error::CollectRecordbatch { .. } => StatusCode::EngineExecuteQuery,
Error::SqlExecIntercepted { source, .. } => source.status_code(),

View File

@@ -26,7 +26,6 @@ mod region_query;
pub mod standalone;
use std::sync::Arc;
use std::time::SystemTime;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
@@ -472,21 +471,6 @@ impl PrometheusHandler for Instance {
.context(ExecuteQuerySnafu)
}
async fn query_label_values(
&self,
metric: String,
label_name: String,
matchers: Vec<Matcher>,
start: SystemTime,
end: SystemTime,
ctx: &QueryContextRef,
) -> server_error::Result<Vec<String>> {
self.handle_query_label_values(metric, label_name, matchers, start, end, ctx)
.await
.map_err(BoxedError::new)
.context(ExecuteQuerySnafu)
}
fn catalog_manager(&self) -> CatalogManagerRef {
self.catalog_manager.clone()
}

View File

@@ -12,26 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::SystemTime;
use catalog::information_schema::TABLES;
use client::OutputData;
use common_catalog::consts::INFORMATION_SCHEMA_NAME;
use common_catalog::format_full_table_name;
use common_recordbatch::util;
use common_telemetry::tracing;
use datatypes::prelude::Value;
use promql_parser::label::{Matcher, Matchers};
use query::promql;
use query::promql::planner::PromPlanner;
use promql_parser::label::Matcher;
use servers::prometheus;
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
use crate::error::{
CatalogSnafu, CollectRecordbatchSnafu, ExecLogicalPlanSnafu,
PrometheusLabelValuesQueryPlanSnafu, PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu,
Result, TableNotFoundSnafu,
PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu, Result, TableNotFoundSnafu,
};
use crate::instance::Instance;
@@ -102,77 +96,4 @@ impl Instance {
Ok(results)
}
/// Handles label values query request, returns the values.
#[tracing::instrument(skip_all)]
pub(crate) async fn handle_query_label_values(
&self,
metric: String,
label_name: String,
matchers: Vec<Matcher>,
start: SystemTime,
end: SystemTime,
ctx: &QueryContextRef,
) -> Result<Vec<String>> {
let table_schema = ctx.current_schema();
let table = self
.catalog_manager
.table(ctx.current_catalog(), &table_schema, &metric, Some(ctx))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;
let dataframe = self
.query_engine
.read_table(table.clone())
.with_context(|_| ReadTableSnafu {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;
let scan_plan = dataframe.into_logical_plan();
let filter_conditions =
PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
.context(PrometheusLabelValuesQueryPlanSnafu)?;
let logical_plan = promql::label_values::rewrite_label_values_query(
table,
scan_plan,
filter_conditions,
label_name,
start,
end,
)
.context(PrometheusLabelValuesQueryPlanSnafu)?;
let results = self
.query_engine
.execute(logical_plan, ctx.clone())
.await
.context(ExecLogicalPlanSnafu)?;
let batches = match results.data {
OutputData::Stream(stream) => util::collect(stream)
.await
.context(CollectRecordbatchSnafu)?,
OutputData::RecordBatches(rbs) => rbs.take(),
_ => unreachable!("should not happen"),
};
let mut results = Vec::with_capacity(batches.iter().map(|b| b.num_rows()).sum());
for batch in batches {
// Only one column in the results, ensured by `prometheus::label_values_matchers_to_plan`.
let names = batch.column(0);
for i in 0..names.len() {
let Value::String(name) = names.get(i) else {
unreachable!();
};
results.push(name.into_string());
}
}
Ok(results)
}
}

View File

@@ -29,7 +29,6 @@ prost.workspace = true
puffin.workspace = true
regex.workspace = true
regex-automata.workspace = true
roaring = "0.10"
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true

View File

@@ -1,868 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io;
use std::ops::RangeInclusive;
use common_base::BitVec;
/// `BitmapType` enumerates how bitmaps are encoded within the inverted index.
pub use greptime_proto::v1::index::BitmapType;
use roaring::RoaringBitmap;
/// A bitmap representation supporting both BitVec and RoaringBitmap formats.
///
/// This enum provides unified bitmap operations while allowing efficient storage
/// in different formats. The implementation automatically handles type conversions
/// when performing operations between different formats.
///
/// # Examples
///
/// Creating a new Roaring bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::new_roaring();
/// assert!(bitmap.is_empty());
/// ```
///
/// Creating a full BitVec bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::full_bitvec(10);
/// assert_eq!(bitmap.count_ones(), 10);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum Bitmap {
Roaring(RoaringBitmap),
BitVec(BitVec),
}
impl Bitmap {
/// Creates a new empty BitVec-based bitmap.
pub fn new_bitvec() -> Self {
Bitmap::BitVec(BitVec::EMPTY)
}
/// Creates a new empty RoaringBitmap-based bitmap.
pub fn new_roaring() -> Self {
Bitmap::Roaring(RoaringBitmap::new())
}
/// Creates a full BitVec-based bitmap with all bits set to 1.
///
/// # Arguments
/// * `size` - The number of bits to allocate and set
pub fn full_bitvec(size: usize) -> Self {
Bitmap::BitVec(BitVec::repeat(true, size))
}
/// Creates a full RoaringBitmap-based bitmap with bits 0..size set to 1.
///
/// # Arguments
/// * `size` - The exclusive upper bound for the bit range
pub fn full_roaring(size: usize) -> Self {
let mut roaring = RoaringBitmap::new();
roaring.insert_range(0..size as u32);
Bitmap::Roaring(roaring)
}
/// Returns the number of bits set to 1 in the bitmap.
pub fn count_ones(&self) -> usize {
match self {
Bitmap::BitVec(bitvec) => bitvec.count_ones(),
Bitmap::Roaring(roaring) => roaring.len() as _,
}
}
/// Checks if the bitmap contains no set bits.
pub fn is_empty(&self) -> bool {
match self {
Bitmap::BitVec(bitvec) => bitvec.is_empty(),
Bitmap::Roaring(roaring) => roaring.is_empty(),
}
}
/// Inserts a range of bits into the bitmap.
///
/// # Arguments
/// * `range` - Inclusive range of bits to set
pub fn insert_range(&mut self, range: RangeInclusive<usize>) {
match self {
Bitmap::BitVec(bitvec) => {
if *range.end() >= bitvec.len() {
bitvec.resize(range.end() + 1, false);
}
for i in range {
bitvec.set(i, true);
}
}
Bitmap::Roaring(roaring) => {
let range = *range.start() as u32..=*range.end() as u32;
roaring.insert_range(range);
}
}
}
/// Serializes the bitmap into a byte buffer using the specified format.
///
/// # Arguments
/// * `serialize_type` - Target format for serialization
/// * `writer` - Output writer to write the serialized data
pub fn serialize_into(
&self,
serialize_type: BitmapType,
mut writer: impl io::Write,
) -> io::Result<()> {
match (self, serialize_type) {
(Bitmap::BitVec(bitvec), BitmapType::BitVec) => {
writer.write_all(bitvec.as_raw_slice())?;
}
(Bitmap::Roaring(roaring), BitmapType::Roaring) => {
roaring.serialize_into(writer)?;
}
(Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
bitmap.serialize_into(writer)?;
}
(Bitmap::Roaring(roaring), BitmapType::BitVec) => {
let bitvec = Bitmap::roaring_to_bitvec(roaring);
writer.write_all(bitvec.as_raw_slice())?;
}
}
Ok(())
}
/// Computes the size of the serialized bitmap in bytes.
///
/// # Arguments
/// * `bitmap_type` - Format of data to be serialized
pub fn serialized_size(&self, bitmap_type: BitmapType) -> usize {
match (self, bitmap_type) {
(Bitmap::BitVec(bitvec), BitmapType::BitVec) => bitvec.as_raw_slice().len(),
(Bitmap::Roaring(roaring), BitmapType::Roaring) => roaring.serialized_size(),
(Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
bitmap.serialized_size()
}
(Bitmap::Roaring(roaring), BitmapType::BitVec) => {
let bitvec = Bitmap::roaring_to_bitvec(roaring);
bitvec.as_raw_slice().len()
}
}
}
/// Deserializes a bitmap from a byte buffer.
///
/// # Arguments
/// * `buf` - Input buffer containing serialized data
/// * `bitmap_type` - Format of the serialized data
pub fn deserialize_from(buf: &[u8], bitmap_type: BitmapType) -> std::io::Result<Self> {
match bitmap_type {
BitmapType::BitVec => {
let bitvec = BitVec::from_slice(buf);
Ok(Bitmap::BitVec(bitvec))
}
BitmapType::Roaring => {
let roaring = RoaringBitmap::deserialize_from(buf)?;
Ok(Bitmap::Roaring(roaring))
}
}
}
/// Computes the union with another bitmap (in-place).
///
/// If the other bitmap is a different type, it will be converted to match
/// the current bitmap's type.
pub fn union(&mut self, other: Self) {
if self.is_empty() {
*self = other;
return;
}
match (self, other) {
(Bitmap::BitVec(bitvec1), bitmap) => {
let bitvec2 = bitmap.into_bitvec();
if bitvec1.len() > bitvec2.len() {
*bitvec1 |= bitvec2
} else {
*bitvec1 = bitvec2 | &*bitvec1;
}
}
(Bitmap::Roaring(roaring1), bitmap) => {
let roaring2 = bitmap.into_roaring();
*roaring1 |= roaring2;
}
}
}
/// Computes the intersection with another bitmap (in-place).
///
/// If the other bitmap is a different type, it will be converted to match
/// the current bitmap's type.
pub fn intersect(&mut self, other: Self) {
match (self, other) {
(Bitmap::BitVec(bitvec1), bitmap) => {
let mut bitvec2 = bitmap.into_bitvec();
let len = (bitvec1.len() - bitvec1.trailing_zeros())
.min(bitvec2.len() - bitvec2.trailing_zeros());
bitvec1.truncate(len);
bitvec2.truncate(len);
*bitvec1 &= bitvec2;
}
(Bitmap::Roaring(roaring1), bitmap) => {
let roaring2 = bitmap.into_roaring();
*roaring1 &= roaring2;
}
}
}
/// Returns an iterator over the indices of set bits.
pub fn iter_ones(&self) -> Box<dyn Iterator<Item = usize> + '_> {
match self {
Bitmap::BitVec(bitvec) => Box::new(bitvec.iter_ones()),
Bitmap::Roaring(roaring) => Box::new(roaring.iter().map(|x| x as usize)),
}
}
/// Creates a bitmap from bytes in LSB0 (least significant bit first) order.
///
/// # Arguments
/// * `bytes` - Input bytes in LSB0 order
/// * `bitmap_type` - Type of bitmap to create
pub fn from_lsb0_bytes(bytes: &[u8], bitmap_type: BitmapType) -> Self {
match bitmap_type {
BitmapType::BitVec => {
let bitvec = BitVec::from_slice(bytes);
Bitmap::BitVec(bitvec)
}
BitmapType::Roaring => {
let roaring = RoaringBitmap::from_lsb0_bytes(0, bytes);
Bitmap::Roaring(roaring)
}
}
}
/// Computes memory usage of the bitmap in bytes.
pub fn memory_usage(&self) -> usize {
match self {
Bitmap::BitVec(bitvec) => bitvec.capacity(),
Bitmap::Roaring(roaring) => {
let stat = roaring.statistics();
(stat.n_bytes_array_containers
+ stat.n_bytes_bitset_containers
+ stat.n_bytes_run_containers) as usize
}
}
}
fn into_bitvec(self) -> BitVec {
match self {
Bitmap::BitVec(bitvec) => bitvec,
Bitmap::Roaring(roaring) => Self::roaring_to_bitvec(&roaring),
}
}
fn into_roaring(self) -> RoaringBitmap {
match self {
Bitmap::Roaring(roaring) => roaring,
Bitmap::BitVec(bitvec) => Self::bitvec_to_roaring(bitvec),
}
}
fn roaring_to_bitvec(roaring: &RoaringBitmap) -> BitVec {
let max_value = roaring.max().unwrap_or(0);
let mut bitvec = BitVec::repeat(false, max_value as usize + 1);
for i in roaring {
bitvec.set(i as usize, true);
}
bitvec
}
fn bitvec_to_roaring(mut bitvec: BitVec) -> RoaringBitmap {
bitvec.resize(bitvec.capacity(), false);
RoaringBitmap::from_lsb0_bytes(0, bitvec.as_raw_slice())
}
}
impl Default for Bitmap {
fn default() -> Self {
Bitmap::new_roaring()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_full_bitmaps() {
let bv = Bitmap::full_bitvec(10);
assert_eq!(bv.count_ones(), 10);
let rb = Bitmap::full_roaring(10);
assert_eq!(rb.count_ones(), 10);
}
#[test]
fn test_serialization_roundtrip() {
let original = Bitmap::full_roaring(100);
let mut buf = Vec::new();
// Serialize as Roaring
original
.serialize_into(BitmapType::Roaring, &mut buf)
.unwrap();
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::Roaring).unwrap();
assert_eq!(original, deserialized);
// Serialize as BitVec
buf.clear();
original
.serialize_into(BitmapType::BitVec, &mut buf)
.unwrap();
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::BitVec).unwrap();
assert_eq!(original.count_ones(), deserialized.count_ones());
}
#[test]
fn test_union_fulls() {
// Test BitVec union
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
bv1.union(bv2);
assert_eq!(bv1.count_ones(), 5);
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
bv1.union(bv2);
assert_eq!(bv1.count_ones(), 5);
// Test Roaring union
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
rb1.union(rb2);
assert_eq!(rb1.count_ones(), 5);
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
rb1.union(rb2);
assert_eq!(rb1.count_ones(), 5);
// Test cross-type union
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
let bv = Bitmap::full_bitvec(3); // 0-2: 111
rb.union(bv);
assert_eq!(rb.count_ones(), 5);
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
let rb = Bitmap::full_roaring(3); // 0-2: 111
bv.union(rb);
assert_eq!(bv.count_ones(), 5);
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
rb.union(bv);
assert_eq!(rb.count_ones(), 5);
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
let rb = Bitmap::full_roaring(5); // 0-4: 11111
bv.union(rb);
assert_eq!(bv.count_ones(), 5);
}
#[test]
fn test_union_bitvec() {
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
);
// Test different lengths
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
);
// Test empty bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
// Test empty and full bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::full_bitvec(8);
bv1.union(bv2);
assert_eq!(bv1, Bitmap::full_bitvec(8));
let mut bv1 = Bitmap::full_bitvec(8);
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert_eq!(bv1, Bitmap::full_bitvec(8));
}
#[test]
fn test_union_roaring() {
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
// Test different lengths
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
);
// Test empty bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
// Test empty and full bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::full_roaring(8);
rb1.union(rb2);
assert_eq!(rb1, Bitmap::full_roaring(8));
let mut rb1 = Bitmap::full_roaring(8);
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert_eq!(rb1, Bitmap::full_roaring(8));
}
#[test]
fn test_union_mixed() {
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::full_bitvec(8);
rb.union(bv);
assert_eq!(rb, Bitmap::full_roaring(8));
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.union(rb);
assert_eq!(bv, Bitmap::full_bitvec(8));
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::full_bitvec(8);
rb.union(bv);
assert_eq!(rb, Bitmap::full_bitvec(8));
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::new_roaring();
bv.union(rb);
assert_eq!(bv, Bitmap::full_bitvec(8));
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::new_bitvec();
rb.union(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::new_roaring();
bv.union(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
let rb = Bitmap::new_roaring();
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
let bv = Bitmap::new_bitvec();
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
}
#[test]
fn test_intersect_fulls() {
// Test BitVec intersect
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
bv1.intersect(bv2);
assert_eq!(bv1.count_ones(), 3);
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
bv1.intersect(bv2);
assert_eq!(bv1.count_ones(), 3);
// Test Roaring intersect
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
rb1.intersect(rb2);
assert_eq!(rb1.count_ones(), 3);
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
rb1.intersect(rb2);
assert_eq!(rb1.count_ones(), 3);
// Test cross-type intersect
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
let bv = Bitmap::full_bitvec(3); // 0-2: 111
rb.intersect(bv);
assert_eq!(rb.count_ones(), 3);
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
let rb = Bitmap::full_roaring(3); // 0-2: 111
bv.intersect(rb);
assert_eq!(bv.count_ones(), 3);
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
rb.intersect(bv);
assert_eq!(rb.count_ones(), 3);
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
let rb = Bitmap::full_roaring(5); // 0-4: 11111
bv.intersect(rb);
assert_eq!(bv.count_ones(), 3);
}
#[test]
fn test_intersect_bitvec() {
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
// Test different lengths
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
// Test empty bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
// Test empty and full bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::full_bitvec(8);
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::full_bitvec(8);
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
}
#[test]
fn test_intersect_roaring() {
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
// Test different lengths
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
// Test empty bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
// Test empty and full bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::full_roaring(8);
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::full_roaring(8);
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
}
#[test]
fn test_intersect_mixed() {
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::full_bitvec(8);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring)
);
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::full_bitvec(8);
rb.intersect(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::new_roaring();
bv.intersect(rb);
assert!(bv.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::full_roaring(8);
bv.intersect(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::full_roaring(8);
let bv = Bitmap::new_bitvec();
rb.intersect(bv);
assert!(rb.is_empty());
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let rb = Bitmap::new_roaring();
bv.intersect(rb);
assert!(bv.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::new_bitvec();
rb.intersect(bv);
assert!(rb.is_empty());
}
#[test]
fn test_insert_range() {
let mut bv = Bitmap::new_bitvec();
bv.insert_range(0..=5);
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
let mut rb = Bitmap::new_roaring();
rb.insert_range(0..=5);
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
let mut bv = Bitmap::new_bitvec();
bv.insert_range(10..=10);
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![10]);
let mut rb = Bitmap::new_roaring();
rb.insert_range(10..=10);
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![10]);
}
}

View File

@@ -17,7 +17,6 @@ pub mod sort_create;
use async_trait::async_trait;
use crate::bitmap::BitmapType;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::InvertedIndexWriter;
use crate::BytesRef;
@@ -54,9 +53,5 @@ pub trait InvertedIndexCreator: Send {
/// Finalizes the index creation process, ensuring all data is properly indexed and stored
/// in the provided writer
async fn finish(
&mut self,
writer: &mut dyn InvertedIndexWriter,
bitmap_type: BitmapType,
) -> Result<()>;
async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()>;
}

View File

@@ -17,23 +17,22 @@ mod intermediate_rw;
mod merge_stream;
use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::ValueStream;
use crate::{Bytes, BytesRef};
/// A stream of sorted values along with their associated bitmap
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;
/// Output of a sorting operation, encapsulating a bitmap for null values and a stream of sorted items
pub struct SortOutput {
/// Bitmap indicating which segments have null values
pub segment_null_bitmap: Bitmap,
pub segment_null_bitmap: BitVec,
/// Stream of sorted items
pub sorted_stream: ValueStream,
pub sorted_stream: SortedStream,
/// Total number of rows in the sorted data
pub total_row_count: usize,

View File

@@ -20,11 +20,11 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use async_trait::async_trait;
use common_base::BitVec;
use common_telemetry::{debug, error};
use futures::stream;
use snafu::ResultExt;
use crate::bitmap::Bitmap;
use crate::external_provider::ExternalTempFileProvider;
use crate::inverted_index::create::sort::intermediate_rw::{
IntermediateReader, IntermediateWriter,
@@ -45,10 +45,18 @@ pub struct ExternalSorter {
temp_file_provider: Arc<dyn ExternalTempFileProvider>,
/// Bitmap indicating which segments have null values
segment_null_bitmap: Bitmap,
segment_null_bitmap: BitVec,
/// In-memory buffer to hold values and their corresponding bitmaps until memory threshold is exceeded
values_buffer: BTreeMap<Bytes, (Bitmap, usize)>,
values_buffer: BTreeMap<Bytes, BitVec>,
/// Count of rows in the last dumped buffer, used to streamline memory usage of `values_buffer`.
///
/// After data is dumped to external files, `last_dump_row_count` is updated to reflect the new starting point
/// for `BitVec` indexing. This means each `BitVec` in `values_buffer` thereafter encodes positions relative to
/// this count, not from 0. This mechanism effectively shrinks the memory footprint of each `BitVec`, helping manage
/// memory use more efficiently by focusing only on newly ingested data post-dump.
last_dump_row_count: usize,
/// Count of all rows ingested so far
total_row_count: usize,
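// A rough illustration of the relative indexing described above (illustration only,
// not part of this patch; the helper below is hypothetical). With segment_row_count
// = 1024 and last_dump_row_count = 4096, a value first seen at global row 5000 is
// recorded at segment index (5000 - 4096) / 1024 = 0 instead of 5000 / 1024 = 4; the
// omitted leading segments are added back later using the leading-zeros count
// (last_dump_row_count / segment_row_count), both when merging the in-memory buffer
// and when reading intermediate files.
fn relative_segment_index(global_row: usize, last_dump_row_count: usize, segment_row_count: usize) -> usize {
    (global_row - last_dump_row_count) / segment_row_count
}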
@@ -85,14 +93,14 @@ impl Sorter for ExternalSorter {
return Ok(());
}
let segment_index_range = self.segment_index_range(n);
let segment_index_range = self.segment_index_range(n, value.is_none());
self.total_row_count += n;
if let Some(value) = value {
let memory_diff = self.push_not_null(value, segment_index_range);
self.may_dump_buffer(memory_diff).await
} else {
self.segment_null_bitmap.insert_range(segment_index_range);
set_bits(&mut self.segment_null_bitmap, segment_index_range);
Ok(())
}
}
@@ -109,10 +117,15 @@ impl Sorter for ExternalSorter {
// TODO(zhongzc): k-way merge instead of 2-way merge
let mut tree_nodes: VecDeque<SortedStream> = VecDeque::with_capacity(readers.len() + 1);
let leading_zeros = self.last_dump_row_count / self.segment_row_count;
tree_nodes.push_back(Box::new(stream::iter(
mem::take(&mut self.values_buffer)
.into_iter()
.map(|(value, (bitmap, _))| Ok((value, bitmap))),
.map(move |(value, mut bitmap)| {
bitmap.resize(bitmap.len() + leading_zeros, false);
bitmap.shift_right(leading_zeros);
Ok((value, bitmap))
}),
)));
for (_, reader) in readers {
tree_nodes.push_back(IntermediateReader::new(reader).into_stream().await?);
@@ -148,10 +161,11 @@ impl ExternalSorter {
index_name,
temp_file_provider,
segment_null_bitmap: Bitmap::new_bitvec(), // bitvec is more efficient for many null values
segment_null_bitmap: BitVec::new(),
values_buffer: BTreeMap::new(),
total_row_count: 0,
last_dump_row_count: 0,
segment_row_count,
current_memory_usage: 0,
@@ -181,7 +195,7 @@ impl ExternalSorter {
}
/// Pushes the non-null values to the values buffer and sets the bits within
/// the specified range in the given bitmap to true.
/// the specified range in the given BitVec to true.
/// Returns the memory usage difference of the buffer after the operation.
fn push_not_null(
&mut self,
@@ -189,23 +203,20 @@ impl ExternalSorter {
segment_index_range: RangeInclusive<usize>,
) -> usize {
match self.values_buffer.get_mut(value) {
Some((bitmap, mem_usage)) => {
bitmap.insert_range(segment_index_range);
let new_usage = bitmap.memory_usage() + value.len();
let diff = new_usage - *mem_usage;
*mem_usage = new_usage;
Some(bitmap) => {
let old_len = bitmap.as_raw_slice().len();
set_bits(bitmap, segment_index_range);
diff
bitmap.as_raw_slice().len() - old_len
}
None => {
let mut bitmap = Bitmap::new_roaring();
bitmap.insert_range(segment_index_range);
let mut bitmap = BitVec::default();
set_bits(&mut bitmap, segment_index_range);
let mem_usage = bitmap.memory_usage() + value.len();
self.values_buffer
.insert(value.to_vec(), (bitmap, mem_usage));
let mem_diff = bitmap.as_raw_slice().len() + value.len();
self.values_buffer.insert(value.to_vec(), bitmap);
mem_usage
mem_diff
}
}
}
@@ -246,8 +257,12 @@ impl ExternalSorter {
.fetch_sub(memory_usage, Ordering::Relaxed);
self.current_memory_usage = 0;
let bitmap_leading_zeros = self.last_dump_row_count / self.segment_row_count;
self.last_dump_row_count =
self.total_row_count - self.total_row_count % self.segment_row_count; // align to segment
let entries = values.len();
IntermediateWriter::new(writer).write_all(values.into_iter().map(|(k, (b, _))| (k, b))).await.inspect(|_|
IntermediateWriter::new(writer).write_all(values, bitmap_leading_zeros as _).await.inspect(|_|
debug!("Dumped {entries} entries ({memory_usage} bytes) to intermediate file {file_id} for index {index_name}")
).inspect_err(|e|
error!(e; "Failed to dump {entries} entries to intermediate file {file_id} for index {index_name}")
@@ -256,8 +271,13 @@ impl ExternalSorter {
/// Determines the segment index range for the row index range
/// `[row_begin, row_begin + n - 1]`
fn segment_index_range(&self, n: usize) -> RangeInclusive<usize> {
let row_begin = self.total_row_count;
fn segment_index_range(&self, n: usize, is_null: bool) -> RangeInclusive<usize> {
let row_begin = if is_null {
self.total_row_count
} else {
self.total_row_count - self.last_dump_row_count
};
let start = self.segment_index(row_begin);
let end = self.segment_index(row_begin + n - 1);
start..=end
@@ -269,6 +289,16 @@ impl ExternalSorter {
}
}
/// Sets the bits within the specified range in the given `BitVec` to true
fn set_bits(bitmap: &mut BitVec, index_range: RangeInclusive<usize>) {
if *index_range.end() >= bitmap.len() {
bitmap.resize(index_range.end() + 1, false);
}
for index in index_range {
bitmap.set(index, true);
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
@@ -300,7 +330,7 @@ mod tests {
move |index_name, file_id| {
assert_eq!(index_name, "test");
let mut files = files.lock().unwrap();
let (writer, reader) = duplex(1024 * 1024);
let (writer, reader) = duplex(8 * 1024);
files.insert(file_id.to_string(), Box::new(reader.compat()));
Ok(Box::new(writer.compat_write()))
}

View File

@@ -19,24 +19,29 @@
//! The serialization format is as follows:
//!
//! ```text
//! [magic][item][item]...[item]
//! [4] [?]
//! [magic][bitmap leading zeros][item][item]...[item]
//! [4] [4] [?]
//!
//! Each [item] is structured as:
//! [value len][value][bitmap len][bitmap]
//! [8] [?] [8] [?]
//! ```
//!
//! Each item represents a value and its associated bitmap, serialized with their lengths for
//! The format starts with a 4-byte magic identifier, followed by a 4-byte
//! count of leading zero bits that are omitted from every serialized bitmap
//! in the file and prepended back by the reader when decoding. Following that, each item represents
//! a value and its associated bitmap, serialized with their lengths for
//! easier deserialization.
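// Sketch of the layout described above (illustration only, not part of this patch;
// the real encoder and decoder live in `codec_v1`). Item lengths are written as
// little-endian u64 and the leading-zeros count as a big-endian u32, matching the
// writer in this module.
fn sketch_encode(leading_zeros: u32, items: &[(Vec<u8>, Vec<u8>)]) -> Vec<u8> {
    let mut out = Vec::new();
    out.extend_from_slice(b"im01");                      // [magic]
    out.extend_from_slice(&leading_zeros.to_be_bytes()); // [bitmap leading zeros]
    for (value, bitmap_bytes) in items {
        out.extend_from_slice(&(value.len() as u64).to_le_bytes());        // [value len]
        out.extend_from_slice(value);                                      // [value]
        out.extend_from_slice(&(bitmap_bytes.len() as u64).to_le_bytes()); // [bitmap len]
        out.extend_from_slice(bitmap_bytes);                               // [bitmap]
    }
    out
}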
mod codec_v1;
use std::collections::BTreeMap;
use asynchronous_codec::{FramedRead, FramedWrite};
use common_base::BitVec;
use futures::{stream, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, StreamExt};
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::{
CloseSnafu, FlushSnafu, ReadSnafu, Result, UnknownIntermediateCodecMagicSnafu, WriteSnafu,
@@ -57,13 +62,12 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
/// Serializes and writes all provided values to the wrapped writer
pub async fn write_all(
mut self,
values: impl IntoIterator<Item = (Bytes, Bitmap)>,
values: BTreeMap<Bytes, BitVec>,
bitmap_leading_zeros: u32,
) -> Result<()> {
let (codec_magic, encoder) = (
codec_v1::CODEC_V1_MAGIC,
codec_v1::IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
},
codec_v1::IntermediateItemEncoderV1,
);
self.writer
@@ -71,6 +75,11 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
.await
.context(WriteSnafu)?;
self.writer
.write_all(&bitmap_leading_zeros.to_be_bytes())
.await
.context(WriteSnafu)?;
let value_stream = stream::iter(values.into_iter().map(Ok));
let frame_write = FramedWrite::new(&mut self.writer, encoder);
// `forward()` will flush and close the writer when the stream ends
@@ -103,9 +112,17 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
.context(ReadSnafu)?;
let decoder = match &magic {
codec_v1::CODEC_V1_MAGIC => codec_v1::IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
},
codec_v1::CODEC_V1_MAGIC => {
let bitmap_leading_zeros = {
let mut buf = [0u8; 4];
self.reader.read_exact(&mut buf).await.context(ReadSnafu)?;
u32::from_be_bytes(buf)
};
codec_v1::IntermediateItemDecoderV1 {
bitmap_leading_zeros,
}
}
_ => return UnknownIntermediateCodecMagicSnafu { magic }.fail(),
};
@@ -115,7 +132,6 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::io::{Seek, SeekFrom};
use futures::io::{AllowStdIo, Cursor};
@@ -124,10 +140,6 @@ mod tests {
use super::*;
use crate::inverted_index::error::Error;
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
#[tokio::test]
async fn test_intermediate_read_write_basic() {
let file_r = tempfile().unwrap();
@@ -136,12 +148,12 @@ mod tests {
let buf_w = AllowStdIo::new(file_w);
let values = BTreeMap::from_iter([
(Bytes::from("a"), bitmap(&[0b10101010])),
(Bytes::from("b"), bitmap(&[0b01010101])),
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
]);
let writer = IntermediateWriter::new(buf_w);
writer.write_all(values.clone()).await.unwrap();
writer.write_all(values.clone(), 0).await.unwrap();
// reset the handle
buf_r.seek(SeekFrom::Start(0)).unwrap();
@@ -149,9 +161,48 @@ mod tests {
let mut stream = reader.into_stream().await.unwrap();
let a = stream.next().await.unwrap().unwrap();
assert_eq!(a, (Bytes::from("a"), bitmap(&[0b10101010])));
assert_eq!(a, (Bytes::from("a"), BitVec::from_slice(&[0b10101010])));
let b = stream.next().await.unwrap().unwrap();
assert_eq!(b, (Bytes::from("b"), bitmap(&[0b01010101])));
assert_eq!(b, (Bytes::from("b"), BitVec::from_slice(&[0b01010101])));
assert!(stream.next().await.is_none());
}
#[tokio::test]
async fn test_intermediate_read_write_with_prefix_zeros() {
let file_r = tempfile().unwrap();
let file_w = file_r.try_clone().unwrap();
let mut buf_r = AllowStdIo::new(file_r);
let buf_w = AllowStdIo::new(file_w);
let values = BTreeMap::from_iter([
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
]);
let writer = IntermediateWriter::new(buf_w);
writer.write_all(values.clone(), 8).await.unwrap();
// reset the handle
buf_r.seek(SeekFrom::Start(0)).unwrap();
let reader = IntermediateReader::new(buf_r);
let mut stream = reader.into_stream().await.unwrap();
let a = stream.next().await.unwrap().unwrap();
assert_eq!(
a,
(
Bytes::from("a"),
BitVec::from_slice(&[0b00000000, 0b10101010])
)
);
let b = stream.next().await.unwrap().unwrap();
assert_eq!(
b,
(
Bytes::from("b"),
BitVec::from_slice(&[0b00000000, 0b01010101])
)
);
assert!(stream.next().await.is_none());
}
@@ -162,7 +213,7 @@ mod tests {
let values = BTreeMap::new();
let writer = IntermediateWriter::new(&mut buf);
writer.write_all(values.clone()).await.unwrap();
writer.write_all(values.clone(), 0).await.unwrap();
let reader = IntermediateReader::new(Cursor::new(buf));
let mut stream = reader.into_stream().await.unwrap();

View File

@@ -16,10 +16,9 @@ use std::io;
use asynchronous_codec::{BytesMut, Decoder, Encoder};
use bytes::{Buf, BufMut};
use greptime_proto::v1::index::BitmapType;
use common_base::BitVec;
use snafu::ResultExt;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::{CommonIoSnafu, Error, Result};
use crate::Bytes;
@@ -29,42 +28,37 @@ const U64_LENGTH: usize = std::mem::size_of::<u64>();
pub const CODEC_V1_MAGIC: &[u8; 4] = b"im01";
/// Serializes items of external sorting intermediate files.
pub struct IntermediateItemEncoderV1 {
pub bitmap_type: BitmapType,
}
pub struct IntermediateItemEncoderV1;
/// [`FramedWrite`] requires the [`Encoder`] trait to be implemented.
impl Encoder for IntermediateItemEncoderV1 {
type Item<'a> = (Bytes, Bitmap);
type Item<'a> = (Bytes, BitVec);
type Error = Error;
fn encode(&mut self, item: (Bytes, Bitmap), dst: &mut BytesMut) -> Result<()> {
fn encode(&mut self, item: (Bytes, BitVec), dst: &mut BytesMut) -> Result<()> {
let value_bytes = item.0;
let bitmap_size = item.1.serialized_size(self.bitmap_type);
let bitmap_bytes = item.1.into_vec();
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_size);
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_bytes.len());
dst.put_u64_le(value_bytes.len() as u64);
dst.extend_from_slice(&value_bytes);
dst.put_u64_le(bitmap_size as u64);
item.1
.serialize_into(self.bitmap_type, &mut dst.writer())
.context(CommonIoSnafu)?;
dst.put_u64_le(bitmap_bytes.len() as u64);
dst.extend_from_slice(&bitmap_bytes);
Ok(())
}
}
/// Deserializes items of external sorting intermediate files.
pub struct IntermediateItemDecoderV1 {
pub bitmap_type: BitmapType,
pub(crate) bitmap_leading_zeros: u32,
}
/// [`FramedRead`] requires the [`Decoder`] trait to be implemented.
impl Decoder for IntermediateItemDecoderV1 {
type Item = (Bytes, Bitmap);
type Item = (Bytes, BitVec);
type Error = Error;
/// Decodes the `src` into `(Bytes, RoaringBitmap)`. Returns `None` if
/// Decodes the `src` into `(Bytes, BitVec)`. Returns `None` if
/// the `src` does not contain enough data for a complete item.
///
/// Only after successful decoding, the `src` is advanced. Otherwise,
@@ -98,8 +92,8 @@ impl Decoder for IntermediateItemDecoderV1 {
return Ok(None);
}
let bitmap = Bitmap::deserialize_from(&buf[..bitmap_len], self.bitmap_type)
.context(CommonIoSnafu)?;
let mut bitmap = BitVec::repeat(false, self.bitmap_leading_zeros as _);
bitmap.extend_from_raw_slice(&buf[..bitmap_len]);
let item = (value_bytes.to_vec(), bitmap);
@@ -119,29 +113,25 @@ impl From<io::Error> for Error {
#[cfg(test)]
mod tests {
use super::*;
use common_base::bit_vec::prelude::{bitvec, Lsb0};
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
use super::*;
#[test]
fn test_intermediate_codec_basic() {
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
bitmap_leading_zeros: 0,
};
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
let item1 = (b"world".to_vec(), bitmap(&[0b01010101]));
let item1 = (b"world".to_vec(), BitVec::from_slice(&[0b01010101]));
encoder.encode(item.clone(), &mut buf).unwrap();
encoder.encode(item1.clone(), &mut buf).unwrap();
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
@@ -152,16 +142,14 @@ mod tests {
#[test]
fn test_intermediate_codec_empty_item() {
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"".to_vec(), bitmap(&[]));
let item = (b"".to_vec(), BitVec::from_slice(&[]));
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
bitmap_leading_zeros: 0,
};
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
@@ -170,19 +158,17 @@ mod tests {
#[test]
fn test_intermediate_codec_partial() {
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
encoder.encode(item.clone(), &mut buf).unwrap();
let partial_length = U64_LENGTH + 3;
let mut partial_bytes = buf.split_to(partial_length);
let mut decoder = IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
bitmap_leading_zeros: 0,
};
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None); // not enough data
partial_bytes.extend_from_slice(&buf[..]);
@@ -190,4 +176,25 @@ mod tests {
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None);
assert!(partial_bytes.is_empty());
}
#[test]
fn test_intermediate_codec_prefix_zeros() {
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]);
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_leading_zeros: 3,
};
let decoded_item = decoder.decode(&mut buf).unwrap().unwrap();
assert_eq!(decoded_item.0, b"hello");
assert_eq!(
decoded_item.1,
bitvec![u8, Lsb0; 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0]
);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
assert!(buf.is_empty());
}
}

View File

@@ -16,10 +16,10 @@ use std::cmp::Ordering;
use std::pin::Pin;
use std::task::{Context, Poll};
use common_base::BitVec;
use futures::{ready, Stream, StreamExt};
use pin_project::pin_project;
use crate::bitmap::Bitmap;
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::Result;
use crate::Bytes;
@@ -28,10 +28,10 @@ use crate::Bytes;
#[pin_project]
pub struct MergeSortedStream {
stream1: Option<SortedStream>,
peek1: Option<(Bytes, Bitmap)>,
peek1: Option<(Bytes, BitVec)>,
stream2: Option<SortedStream>,
peek2: Option<(Bytes, Bitmap)>,
peek2: Option<(Bytes, BitVec)>,
}
impl MergeSortedStream {
@@ -49,7 +49,7 @@ impl MergeSortedStream {
}
impl Stream for MergeSortedStream {
type Item = Result<(Bytes, Bitmap)>;
type Item = Result<(Bytes, BitVec)>;
/// Polls both streams and returns the next item from the stream that has the smaller next item.
/// If both streams have the same next item, the bitmaps are unioned together.
@@ -89,77 +89,77 @@ impl Stream for MergeSortedStream {
}
/// Merges two bitmaps by bit-wise OR'ing them together, preserving all bits from both
fn merge_bitmaps(mut bitmap1: Bitmap, bitmap2: Bitmap) -> Bitmap {
bitmap1.union(bitmap2);
bitmap1
fn merge_bitmaps(bitmap1: BitVec, bitmap2: BitVec) -> BitVec {
// make sure longer bitmap is on the left to avoid truncation
#[allow(clippy::if_same_then_else)]
if bitmap1.len() > bitmap2.len() {
bitmap1 | bitmap2
} else {
bitmap2 | bitmap1
}
}
#[cfg(test)]
mod tests {
use futures::stream;
use greptime_proto::v1::index::BitmapType;
use super::*;
use crate::inverted_index::error::Error;
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
fn sorted_stream_from_vec(vec: Vec<(Bytes, Bitmap)>) -> SortedStream {
fn sorted_stream_from_vec(vec: Vec<(Bytes, BitVec)>) -> SortedStream {
Box::new(stream::iter(vec.into_iter().map(Ok::<_, Error>)))
}
#[tokio::test]
async fn test_merge_sorted_stream_non_overlapping() {
let stream1 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), bitmap(&[0b10101010])),
(Bytes::from("orange"), bitmap(&[0b01010101])),
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("orange"), BitVec::from_slice(&[0b01010101])),
]);
let stream2 = sorted_stream_from_vec(vec![
(Bytes::from("banana"), bitmap(&[0b10101010])),
(Bytes::from("peach"), bitmap(&[0b01010101])),
(Bytes::from("banana"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
]);
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("apple"));
assert_eq!(item.1, bitmap(&[0b10101010]));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("banana"));
assert_eq!(item.1, bitmap(&[0b10101010]));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("orange"));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("peach"));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
assert!(merged_stream.next().await.is_none());
}
#[tokio::test]
async fn test_merge_sorted_stream_overlapping() {
let stream1 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), bitmap(&[0b10101010])),
(Bytes::from("orange"), bitmap(&[0b10101010])),
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("orange"), BitVec::from_slice(&[0b10101010])),
]);
let stream2 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), bitmap(&[0b01010101])),
(Bytes::from("peach"), bitmap(&[0b01010101])),
(Bytes::from("apple"), BitVec::from_slice(&[0b01010101])),
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
]);
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("apple"));
assert_eq!(item.1, bitmap(&[0b11111111]));
assert_eq!(item.1, BitVec::from_slice(&[0b11111111]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("orange"));
assert_eq!(item.1, bitmap(&[0b10101010]));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("peach"));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
assert!(merged_stream.next().await.is_none());
}

View File

@@ -18,7 +18,6 @@ use std::num::NonZeroUsize;
use async_trait::async_trait;
use snafu::ensure;
use crate::bitmap::BitmapType;
use crate::inverted_index::create::sort::{SortOutput, Sorter};
use crate::inverted_index::create::InvertedIndexCreator;
use crate::inverted_index::error::{InconsistentRowCountSnafu, Result};
@@ -69,11 +68,7 @@ impl InvertedIndexCreator for SortIndexCreator {
}
/// Finalizes the sorting for all indexes and writes them using the inverted index writer
async fn finish(
&mut self,
writer: &mut dyn InvertedIndexWriter,
bitmap_type: BitmapType,
) -> Result<()> {
async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()> {
let mut output_row_count = None;
for (index_name, mut sorter) in self.sorters.drain() {
let SortOutput {
@@ -93,7 +88,7 @@ impl InvertedIndexCreator for SortIndexCreator {
);
writer
.add_index(index_name, segment_null_bitmap, sorted_stream, bitmap_type)
.add_index(index_name, segment_null_bitmap, sorted_stream)
.await?;
}
@@ -122,9 +117,9 @@ mod tests {
use futures::{stream, StreamExt};
use super::*;
use crate::bitmap::Bitmap;
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::Error;
use crate::inverted_index::format::writer::{MockInvertedIndexWriter, ValueStream};
use crate::inverted_index::format::writer::MockInvertedIndexWriter;
use crate::Bytes;
#[tokio::test]
@@ -148,10 +143,11 @@ mod tests {
}
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer.expect_add_index().times(3).returning(
|name, null_bitmap, stream, bitmap_type| {
mock_writer
.expect_add_index()
.times(3)
.returning(|name, null_bitmap, stream| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
match name.as_str() {
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
@@ -159,8 +155,7 @@ mod tests {
_ => panic!("unexpected index name: {}", name),
}
Ok(())
},
);
});
mock_writer
.expect_finish()
.times(1)
@@ -170,10 +165,7 @@ mod tests {
Ok(())
});
creator
.finish(&mut mock_writer, BitmapType::Roaring)
.await
.unwrap();
creator.finish(&mut mock_writer).await.unwrap();
}
#[tokio::test]
@@ -199,9 +191,8 @@ mod tests {
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer
.expect_add_index()
.returning(|name, null_bitmap, stream, bitmap_type| {
.returning(|name, null_bitmap, stream| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
match name.as_str() {
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
@@ -212,7 +203,7 @@ mod tests {
});
mock_writer.expect_finish().never();
let res = creator.finish(&mut mock_writer, BitmapType::Roaring).await;
let res = creator.finish(&mut mock_writer).await;
assert!(matches!(res, Err(Error::InconsistentRowCount { .. })));
}
@@ -228,9 +219,8 @@ mod tests {
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer
.expect_add_index()
.returning(|name, null_bitmap, stream, bitmap_type| {
.returning(|name, null_bitmap, stream| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
assert!(matches!(name.as_str(), "a" | "b" | "c"));
assert!(stream_to_values(stream).is_empty());
Ok(())
@@ -244,10 +234,7 @@ mod tests {
Ok(())
});
creator
.finish(&mut mock_writer, BitmapType::Roaring)
.await
.unwrap();
creator.finish(&mut mock_writer).await.unwrap();
}
fn set_bit(bit_vec: &mut BitVec, index: usize) {
@@ -296,21 +283,20 @@ mod tests {
async fn output(&mut self) -> Result<SortOutput> {
let segment_null_bitmap = self.values.remove(&None).unwrap_or_default();
let segment_null_bitmap = Bitmap::BitVec(segment_null_bitmap);
Ok(SortOutput {
segment_null_bitmap,
sorted_stream: Box::new(stream::iter(
std::mem::take(&mut self.values)
.into_iter()
.map(|(v, b)| Ok((v.unwrap(), Bitmap::BitVec(b)))),
.map(|(v, b)| Ok((v.unwrap(), b))),
)),
total_row_count: self.total_row_count,
})
}
}
fn stream_to_values(stream: ValueStream) -> Vec<Bytes> {
fn stream_to_values(stream: SortedStream) -> Vec<Bytes> {
futures::executor::block_on(async {
stream.map(|r| r.unwrap().0).collect::<Vec<Bytes>>().await
})

View File

@@ -110,14 +110,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to decode bitmap"))]
DecodeBitmap {
#[snafu(source)]
error: IoError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to decode protobuf"))]
DecodeProto {
#[snafu(source)]
@@ -248,7 +240,6 @@ impl ErrorExt for Error {
| CommonIo { .. }
| UnknownIntermediateCodecMagic { .. }
| FstCompile { .. }
| DecodeBitmap { .. }
| InvalidFooterPayloadSize { .. }
| BlobSizeTooSmall { .. } => StatusCode::Unexpected,

View File

@@ -18,11 +18,11 @@ use std::sync::Arc;
use async_trait::async_trait;
use bytes::Bytes;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{DecodeBitmapSnafu, DecodeFstSnafu, Result};
use crate::inverted_index::error::{DecodeFstSnafu, Result};
pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
use crate::inverted_index::FstMap;
@@ -67,25 +67,17 @@ pub trait InvertedIndexReader: Send + Sync {
}
/// Retrieves the bitmap from the given offset and size.
async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
self.range_read(offset, size).await.and_then(|bytes| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
async fn bitmap(&self, offset: u64, size: u32) -> Result<BitVec> {
self.range_read(offset, size).await.map(BitVec::from_vec)
}
/// Retrieves the multiple bitmaps from the given ranges.
async fn bitmap_deque(
&mut self,
ranges: &[(Range<u64>, BitmapType)],
) -> Result<VecDeque<Bitmap>> {
let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
let bytes = self.read_vec(&ranges).await?;
bytes
async fn bitmap_deque(&mut self, ranges: &[Range<u64>]) -> Result<VecDeque<BitVec>> {
Ok(self
.read_vec(ranges)
.await?
.into_iter()
.zip(types)
.map(|(bytes, bitmap_type)| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
.collect::<Result<VecDeque<_>>>()
.map(|bytes| BitVec::from_slice(bytes.as_ref()))
.collect::<VecDeque<_>>())
}
}

View File

@@ -78,14 +78,14 @@ impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
#[cfg(test)]
mod tests {
use common_base::bit_vec::prelude::*;
use fst::MapBuilder;
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta, InvertedIndexMetas};
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
use prost::Message;
use super::*;
use crate::bitmap::Bitmap;
fn mock_fst() -> Vec<u8> {
fn create_fake_fst() -> Vec<u8> {
let mut fst_buf = Vec::new();
let mut build = MapBuilder::new(&mut fst_buf).unwrap();
build.insert("key1".as_bytes(), 1).unwrap();
@@ -94,27 +94,19 @@ mod tests {
fst_buf
}
fn mock_bitmap() -> Bitmap {
Bitmap::from_lsb0_bytes(&[0b10101010, 0b10000000], BitmapType::Roaring)
}
fn mock_bitmap_bytes() -> Vec<u8> {
let mut buf = Vec::new();
mock_bitmap()
.serialize_into(BitmapType::Roaring, &mut buf)
.unwrap();
buf
fn create_fake_bitmap() -> Vec<u8> {
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0, 1, 0].into_vec()
}
fn create_inverted_index_blob() -> Vec<u8> {
let bitmap_size = mock_bitmap_bytes().len();
let fst_size = mock_fst().len();
let bitmap_size = create_fake_bitmap().len();
let fst_size = create_fake_fst().len();
// first index
let mut inverted_index = Vec::new();
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // value bitmap
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // null bitmap
inverted_index.extend_from_slice(&mock_fst()); // fst
inverted_index.extend_from_slice(&create_fake_bitmap()); // value bitmap
inverted_index.extend_from_slice(&create_fake_bitmap()); // null bitmap
inverted_index.extend_from_slice(&create_fake_fst()); // fst
let meta = InvertedIndexMeta {
name: "tag0".to_string(),
@@ -124,7 +116,6 @@ mod tests {
null_bitmap_size: bitmap_size as _,
relative_fst_offset: (bitmap_size * 2) as _,
fst_size: fst_size as _,
bitmap_type: BitmapType::Roaring as _,
..Default::default()
};
@@ -137,7 +128,6 @@ mod tests {
null_bitmap_size: bitmap_size as _,
relative_fst_offset: (bitmap_size * 2) as _,
fst_size: fst_size as _,
bitmap_type: BitmapType::Roaring as _,
..Default::default()
};
@@ -178,19 +168,19 @@ mod tests {
let meta0 = metas.metas.get("tag0").unwrap();
assert_eq!(meta0.name, "tag0");
assert_eq!(meta0.base_offset, 0);
assert_eq!(meta0.inverted_index_size, 102);
assert_eq!(meta0.relative_null_bitmap_offset, 26);
assert_eq!(meta0.null_bitmap_size, 26);
assert_eq!(meta0.relative_fst_offset, 52);
assert_eq!(meta0.inverted_index_size, 54);
assert_eq!(meta0.relative_null_bitmap_offset, 2);
assert_eq!(meta0.null_bitmap_size, 2);
assert_eq!(meta0.relative_fst_offset, 4);
assert_eq!(meta0.fst_size, 50);
let meta1 = metas.metas.get("tag1").unwrap();
assert_eq!(meta1.name, "tag1");
assert_eq!(meta1.base_offset, 102);
assert_eq!(meta1.inverted_index_size, 102);
assert_eq!(meta1.relative_null_bitmap_offset, 26);
assert_eq!(meta1.null_bitmap_size, 26);
assert_eq!(meta1.relative_fst_offset, 52);
assert_eq!(meta1.base_offset, 54);
assert_eq!(meta1.inverted_index_size, 54);
assert_eq!(meta1.relative_null_bitmap_offset, 2);
assert_eq!(meta1.null_bitmap_size, 2);
assert_eq!(meta1.relative_fst_offset, 4);
assert_eq!(meta1.fst_size, 50);
}
@@ -234,29 +224,17 @@ mod tests {
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag0").unwrap();
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag1").unwrap();
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
}
}

View File

@@ -18,14 +18,14 @@ mod single;
use std::num::NonZeroUsize;
use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::Result;
pub use crate::inverted_index::format::writer::blob::InvertedIndexBlobWriter;
use crate::Bytes;
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;
/// Trait for writing inverted index data to underlying storage.
#[mockall::automock]
@@ -37,13 +37,11 @@ pub trait InvertedIndexWriter: Send {
/// * `null_bitmap` marks positions of null entries.
/// * `values` is a stream of values and their locations, yielded lexicographically.
/// Errors occur if the values are out of order.
/// * `bitmap_type` is the type of bitmap to encode.
async fn add_index(
&mut self,
name: String,
null_bitmap: Bitmap,
null_bitmap: BitVec,
values: ValueStream,
bitmap_type: BitmapType,
) -> Result<()>;
/// Finalizes the index writing process, ensuring all data is written.

View File

@@ -15,12 +15,12 @@
use std::num::NonZeroUsize;
use async_trait::async_trait;
use common_base::BitVec;
use futures::{AsyncWrite, AsyncWriteExt};
use greptime_proto::v1::index::InvertedIndexMetas;
use prost::Message;
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{CloseSnafu, FlushSnafu, Result, WriteSnafu};
use crate::inverted_index::format::writer::single::SingleIndexWriter;
use crate::inverted_index::format::writer::{InvertedIndexWriter, ValueStream};
@@ -43,9 +43,8 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
async fn add_index(
&mut self,
name: String,
null_bitmap: Bitmap,
null_bitmap: BitVec,
values: ValueStream,
bitmap_type: BitmapType,
) -> Result<()> {
let single_writer = SingleIndexWriter::new(
name.clone(),
@@ -53,7 +52,6 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
null_bitmap,
values,
&mut self.blob_writer,
bitmap_type,
);
let metadata = single_writer.write().await?;
@@ -102,7 +100,6 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexBlobWriter<W> {
#[cfg(test)]
mod tests {
use futures::stream;
use greptime_proto::v1::index::BitmapType;
use super::*;
use crate::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
@@ -135,44 +132,24 @@ mod tests {
writer
.add_index(
"tag0".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
])),
BitmapType::Roaring,
)
.await
.unwrap();
writer
.add_index(
"tag1".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("x"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("y"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("z"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
])),
BitmapType::Roaring,
)
.await
.unwrap();
@@ -204,31 +181,22 @@ mod tests {
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// tag1
let tag1 = metadata.metas.get("tag1").unwrap();
@@ -247,30 +215,21 @@ mod tests {
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
}
}

View File

@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_base::BitVec;
use fst::MapBuilder;
use futures::{AsyncWrite, AsyncWriteExt, Stream, StreamExt};
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexStats};
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{FstCompileSnafu, FstInsertSnafu, Result, WriteSnafu};
use crate::Bytes;
@@ -27,7 +27,7 @@ pub struct SingleIndexWriter<W, S> {
blob_writer: W,
/// The null bitmap to be written
null_bitmap: Bitmap,
null_bitmap: BitVec,
/// The stream of values to be written, yielded lexicographically
values: S,
@@ -37,40 +37,30 @@ pub struct SingleIndexWriter<W, S> {
/// Metadata about the index
meta: InvertedIndexMeta,
/// The type of bitmap to use
bitmap_type: BitmapType,
/// Buffer for writing the blob
buf: Vec<u8>,
}
impl<W, S> SingleIndexWriter<W, S>
where
W: AsyncWrite + Send + Unpin,
S: Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin,
S: Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin,
{
/// Constructs a new `SingleIndexWriter`
pub fn new(
name: String,
base_offset: u64,
null_bitmap: Bitmap,
null_bitmap: BitVec,
values: S,
blob_writer: W,
bitmap_type: BitmapType,
) -> SingleIndexWriter<W, S> {
SingleIndexWriter {
blob_writer,
null_bitmap,
values,
fst: MapBuilder::memory(),
bitmap_type,
buf: Vec::new(),
meta: InvertedIndexMeta {
name,
base_offset,
stats: Some(InvertedIndexStats::default()),
bitmap_type: bitmap_type.into(),
..Default::default()
},
}
@@ -90,17 +80,14 @@ where
/// Writes the null bitmap to the blob and updates the metadata accordingly
async fn write_null_bitmap(&mut self) -> Result<()> {
self.buf.clear();
self.null_bitmap
.serialize_into(self.bitmap_type, &mut self.buf)
.expect("Write to vec should not fail");
let null_bitmap_bytes = self.null_bitmap.as_raw_slice();
self.blob_writer
.write_all(&self.buf)
.write_all(null_bitmap_bytes)
.await
.context(WriteSnafu)?;
self.meta.relative_null_bitmap_offset = self.meta.inverted_index_size as _;
self.meta.null_bitmap_size = self.buf.len() as _;
self.meta.null_bitmap_size = null_bitmap_bytes.len() as _;
self.meta.inverted_index_size += self.meta.null_bitmap_size as u64;
// update stats
@@ -113,18 +100,15 @@ where
}
/// Appends a value and its bitmap to the blob, updates the FST, and the metadata
async fn append_value(&mut self, value: Bytes, bitmap: Bitmap) -> Result<()> {
self.buf.clear();
bitmap
.serialize_into(self.bitmap_type, &mut self.buf)
.expect("Write to vec should not fail");
async fn append_value(&mut self, value: Bytes, bitmap: BitVec) -> Result<()> {
let bitmap_bytes = bitmap.into_vec();
self.blob_writer
.write_all(&self.buf)
.write_all(&bitmap_bytes)
.await
.context(WriteSnafu)?;
let offset = self.meta.inverted_index_size as u32;
let size = self.buf.len() as u32;
let size = bitmap_bytes.len() as u32;
self.meta.inverted_index_size += size as u64;
let packed = bytemuck::cast::<[u32; 2], u64>([offset, size]);
@@ -173,10 +157,9 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
Bitmap::new_roaring(),
BitVec::new(),
stream::empty(),
&mut blob,
BitmapType::Roaring,
);
let meta = writer.write().await.unwrap();
@@ -191,23 +174,13 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
stream::iter(vec![
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
]),
&mut blob,
BitmapType::Roaring,
);
let meta = writer.write().await.unwrap();
@@ -226,23 +199,13 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
stream::iter(vec![
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
]),
&mut blob,
BitmapType::Roaring,
);
let res = writer.write().await;
assert!(matches!(res, Err(Error::FstInsert { .. })));

View File

@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMeta;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
@@ -36,7 +36,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
pub async fn map_values_vec(
&mut self,
value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
) -> Result<Vec<Bitmap>> {
) -> Result<Vec<BitVec>> {
let groups = value_and_meta_vec
.iter()
.map(|(values, _)| values.len())
@@ -50,17 +50,15 @@ impl<'a> ParallelFstValuesMapper<'a> {
// bitmap offset and the lower 32 bits represent its size. This mapper uses these
// combined offset-size pairs to fetch and union multiple bitmaps into a single `BitVec`.
let [relative_offset, size] = bytemuck::cast::<u64, [u32; 2]>(*value);
let range = meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64;
fetch_ranges.push((
range,
BitmapType::try_from(meta.bitmap_type).unwrap_or(BitmapType::BitVec),
));
fetch_ranges.push(
meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64,
);
}
}
if fetch_ranges.is_empty() {
return Ok(vec![Bitmap::new_bitvec()]);
return Ok(vec![BitVec::new()]);
}
common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
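// Illustration of the packed values mentioned above (not part of this patch): the
// writer stores each bitmap's relative offset and size as a [u32; 2] reinterpreted
// as a single u64 via bytemuck, and this mapper reverses that same cast losslessly.
fn pack_offset_size(relative_offset: u32, size: u32) -> u64 {
    bytemuck::cast::<[u32; 2], u64>([relative_offset, size])
}
fn unpack_offset_size(packed: u64) -> [u32; 2] {
    bytemuck::cast::<u64, [u32; 2]>(packed)
}
// e.g. unpack_offset_size(pack_offset_size(2, 1)) == [2, 1], matching the
// `value(2, 1)` helper used in the tests below.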
@@ -68,10 +66,14 @@ impl<'a> ParallelFstValuesMapper<'a> {
let mut output = Vec::with_capacity(groups.len());
for counter in groups {
let mut bitmap = Bitmap::new_roaring();
let mut bitmap = BitVec::new();
for _ in 0..counter {
let bm = bitmaps.pop_front().unwrap();
bitmap.union(bm);
if bm.len() > bitmap.len() {
bitmap = bm | bitmap
} else {
bitmap |= bm
}
}
output.push(bitmap);
@@ -85,6 +87,8 @@ impl<'a> ParallelFstValuesMapper<'a> {
mod tests {
use std::collections::VecDeque;
use common_base::bit_vec::prelude::*;
use super::*;
use crate::inverted_index::format::reader::MockInvertedIndexReader;
@@ -97,26 +101,19 @@ mod tests {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
for range in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
}
match (offset, size) {
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]),
(2, 1) => output.push_back(bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]),
_ => unreachable!(),
}
}
Ok(output)
});
let meta = InvertedIndexMeta {
bitmap_type: BitmapType::Roaring.into(),
..Default::default()
};
let meta = InvertedIndexMeta::default();
let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);
let result = values_mapper
@@ -129,50 +126,33 @@ mod tests {
.map_values_vec(&[(vec![value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
assert_eq!(
result[1],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
let result = values_mapper
.map_values_vec(&[
(vec![value(2, 1), value(1, 1)], &meta),
@@ -180,13 +160,7 @@ mod tests {
])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
assert_eq!(
result[1],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
}
}

View File

@@ -15,17 +15,17 @@
mod predicates_apply;
use async_trait::async_trait;
use common_base::BitVec;
pub use predicates_apply::PredicatesIndexApplier;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
/// The output of an apply operation.
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ApplyOutput {
/// Bitmap of indices that match the predicates.
pub matched_segment_ids: Bitmap,
pub matched_segment_ids: BitVec,
/// The total number of rows in the index.
pub total_row_count: usize,

View File

@@ -15,9 +15,9 @@
use std::mem::size_of;
use async_trait::async_trait;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::search::fst_apply::{
@@ -50,11 +50,12 @@ impl IndexApplier for PredicatesIndexApplier {
) -> Result<ApplyOutput> {
let metadata = reader.metadata().await?;
let mut output = ApplyOutput {
matched_segment_ids: Bitmap::new_bitvec(),
matched_segment_ids: BitVec::EMPTY,
total_row_count: metadata.total_row_count as _,
segment_row_count: metadata.segment_row_count as _,
};
let mut bitmap = Self::bitmap_full_range(&metadata);
// TODO(zhongzc): optimize the order of applying to make it quicker to return empty.
let mut appliers = Vec::with_capacity(self.fst_appliers.len());
let mut fst_ranges = Vec::with_capacity(self.fst_appliers.len());
@@ -80,7 +81,7 @@ impl IndexApplier for PredicatesIndexApplier {
}
if fst_ranges.is_empty() {
output.matched_segment_ids = Self::bitmap_full_range(&metadata);
output.matched_segment_ids = bitmap;
return Ok(output);
}
@@ -92,15 +93,14 @@ impl IndexApplier for PredicatesIndexApplier {
.collect::<Vec<_>>();
let mut mapper = ParallelFstValuesMapper::new(reader);
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
let bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
for bm in bm_vec {
if bm.count_ones() == 0 {
if bitmap.count_ones() == 0 {
break;
}
bitmap.intersect(bm);
bitmap &= bm;
}
output.matched_segment_ids = bitmap;
@@ -146,12 +146,12 @@ impl PredicatesIndexApplier {
Ok(PredicatesIndexApplier { fst_appliers })
}
/// Creates a `Bitmap` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> Bitmap {
/// Creates a `BitVec` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> BitVec {
let total_count = metadata.total_row_count;
let segment_count = metadata.segment_row_count;
let len = total_count.div_ceil(segment_count);
Bitmap::full_bitvec(len as _)
BitVec::repeat(true, len as _)
}
}
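// Worked example (illustrative): with total_row_count = 100 and segment_row_count = 16,
// the full-range bitmap spans 100.div_ceil(16) = 7 segments, all initialized to true.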
@@ -167,10 +167,10 @@ mod tests {
use std::collections::VecDeque;
use std::sync::Arc;
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
use common_base::bit_vec::prelude::*;
use greptime_proto::v1::index::InvertedIndexMeta;
use super::*;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Error;
use crate::inverted_index::format::reader::MockInvertedIndexReader;
use crate::inverted_index::search::fst_apply::MockFstApplier;
@@ -190,7 +190,6 @@ mod tests {
let meta = InvertedIndexMeta {
name: s(tag),
relative_fst_offset: idx,
bitmap_type: BitmapType::Roaring.into(),
..Default::default()
};
metas.metas.insert(s(tag), meta);
@@ -230,16 +229,10 @@ mod tests {
.unwrap()])
});
mock_reader.expect_bitmap_deque().returning(|arg| {
assert_eq!(arg.len(), 1);
let range = &arg[0].0;
let bitmap_type = arg[0].1;
assert_eq!(*range, 2..3);
assert_eq!(bitmap_type, BitmapType::Roaring);
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
&[0b10101010],
bitmap_type,
)]))
mock_reader.expect_bitmap_deque().returning(|range| {
assert_eq!(range.len(), 1);
assert_eq!(range[0], 2..3);
Ok(VecDeque::from([bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]]))
});
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
@@ -247,7 +240,7 @@ mod tests {
.unwrap();
assert_eq!(
output.matched_segment_ids,
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]
);
// An index reader with a single tag "tag-0" but without value "tag-0_value-0"
@@ -299,16 +292,12 @@ mod tests {
});
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
for range in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
}
match (offset, size) {
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]),
(2, 1) => output.push_back(bitvec![u8, Lsb0; 1, 1, 0, 1, 1, 0, 1, 1]),
_ => unreachable!(),
}
}
@@ -322,7 +311,7 @@ mod tests {
.unwrap();
assert_eq!(
output.matched_segment_ids,
Bitmap::from_lsb0_bytes(&[0b10001010], BitmapType::Roaring)
bitvec![u8, Lsb0; 1, 0, 0, 0, 1, 0, 1, 0]
);
}
@@ -341,7 +330,10 @@ mod tests {
.apply(SearchContext::default(), &mut mock_reader)
.await
.unwrap();
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
); // full range to scan
}
#[tokio::test]
@@ -413,7 +405,10 @@ mod tests {
)
.await
.unwrap();
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8));
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
);
}
#[test]

View File

@@ -15,7 +15,6 @@
#![feature(iter_partition_in_place)]
#![feature(assert_matches)]
pub mod bitmap;
pub mod bloom_filter;
pub mod error;
pub mod external_provider;

23
src/ingester/Cargo.toml Normal file
View File

@@ -0,0 +1,23 @@
[package]
name = "ingester"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
clap.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datanode.workspace = true
meta-client.workspace = true
mito2.workspace = true
object-store.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
sst-convert.workspace = true
tokio.workspace = true
toml.workspace = true
[lints]
workspace = true

294
src/ingester/src/main.rs Normal file
View File

@@ -0,0 +1,294 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use clap::Parser;
use common_telemetry::info;
use common_time::timestamp::TimeUnit;
use datanode::config::StorageConfig;
use meta_client::MetaClientOptions;
use mito2::config::MitoConfig;
use mito2::sst::file::IndexType;
use mito2::sst::parquet::SstInfo;
use serde::{Deserialize, Serialize};
use sst_convert::converter::{InputFile, InputFileType, SstConverterBuilder};
use tokio::sync::oneshot;
#[derive(Parser, Debug)]
#[command(version, about = "Greptime Ingester", long_about = None)]
struct Args {
/// Input directory
#[arg(short, long)]
input_dir: String,
/// Directory of input parquet files, relative to input_dir
#[arg(short, long)]
parquet_dir: Option<String>,
/// Directory of input json files, relative to input_dir
#[arg(short, long)]
remote_write_dir: Option<String>,
/// Config file
#[arg(short, long)]
cfg: String,
/// DB HTTP address
#[arg(short, long)]
db_http_addr: String,
/// Output path for the converted SST files.
/// If it is not None, the converted SST files will be written to the specified path
/// in the `input_store`.
/// This is for debugging purposes.
#[arg(short, long)]
sst_output_path: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct IngesterConfig {
meta_client: MetaClientOptions,
storage: StorageConfig,
mito: MitoConfig,
}
pub const APP_NAME: &str = "greptime-ingester";
#[tokio::main]
async fn main() {
let _guard = common_telemetry::init_global_logging(
APP_NAME,
&Default::default(),
&Default::default(),
None,
);
let args = Args::parse();
let cfg_file = std::fs::read_to_string(&args.cfg).expect("Failed to read config file");
let cfg: IngesterConfig = toml::from_str(&cfg_file).expect("Failed to parse config");
let sst_builder = {
let mut builder = SstConverterBuilder::new_fs(args.input_dir)
.with_meta_options(cfg.meta_client)
.with_storage_config(cfg.storage)
.with_config(cfg.mito);
if let Some(output_path) = args.sst_output_path {
builder = builder.with_output_path(output_path);
}
builder
};
let sst_converter = sst_builder
.clone()
.build()
.await
.expect("Failed to build sst converter");
let input_store = sst_converter.input_store.clone();
if let Some(parquet_dir) = args.parquet_dir {
// using opendal to read parquet files in the given input object store
let all_parquets = input_store
.list(&parquet_dir)
.await
.expect("Failed to list parquet files");
info!("Listed all files in parquet directory: {:?}", all_parquets);
let all_parquets = all_parquets
.iter()
.filter(|parquet| parquet.name().ends_with(".parquet") && parquet.metadata().is_file())
.collect::<Vec<_>>();
let input_files = all_parquets
.iter()
.map(|parquet| {
let full_table_name = parquet.name().split("-").next().unwrap();
let (catalog_name, schema_name, table_name) = extract_name(full_table_name);
info!(
"catalog: {}, schema: {}, table: {}",
catalog_name, schema_name, table_name
);
InputFile {
catalog: catalog_name.to_string(),
schema: schema_name.to_string(),
table: table_name.to_string(),
path: parquet.path().to_string(),
file_type: InputFileType::Parquet,
}
})
.collect::<Vec<_>>();
convert_and_send(&input_files, sst_builder.clone(), &args.db_http_addr).await;
}
if let Some(remote_write_dir) = args.remote_write_dir {
// using opendal to read remote write parquet files in the given input object store
let all_parquets = input_store
.list(&remote_write_dir)
.await
.expect("Failed to list parquet files");
let all_parquets = all_parquets
.iter()
.filter(|parquet| parquet.name().ends_with(".parquet") && parquet.metadata().is_file())
.collect::<Vec<_>>();
let input_files = all_parquets
.iter()
.map(|parquet| {
let full_table_name = parquet.name().split("-").next().unwrap();
let (catalog_name, schema_name, table_name) = extract_name(full_table_name);
info!(
"catalog: {}, schema: {}, table: {}",
catalog_name, schema_name, table_name
);
InputFile {
catalog: catalog_name.to_string(),
schema: schema_name.to_string(),
table: table_name.to_string(),
path: parquet.path().to_string(),
file_type: InputFileType::RemoteWrite,
}
})
.collect::<Vec<_>>();
convert_and_send(&input_files, sst_builder.clone(), &args.db_http_addr).await;
}
}
async fn convert_and_send(
input_files: &[InputFile],
sst_builder: SstConverterBuilder,
db_http_addr: &str,
) {
let table_names = input_files
.iter()
.map(|f| (f.schema.clone(), f.table.clone()))
.collect::<Vec<_>>();
let mut rxs = Vec::new();
// Spawn a task for each input file
info!("Spawning tasks for {} input files", input_files.len());
for input_file in input_files.iter() {
let (tx, rx) = oneshot::channel();
let sst_builder = sst_builder.clone();
let input_file = (*input_file).clone();
tokio::task::spawn(async move {
let mut sst_converter = sst_builder
.build()
.await
.expect("Failed to build sst converter");
let sst_info = sst_converter
.convert_one(&input_file)
.await
.expect("Failed to convert parquet files");
tx.send(sst_info).unwrap();
});
rxs.push(rx);
}
let mut sst_infos = Vec::new();
for rx in rxs {
sst_infos.push(rx.await.unwrap());
}
info!("Converted {} input files", sst_infos.len());
let ingest_reqs = table_names
.iter()
.zip(sst_infos.iter())
.flat_map(|(schema_name, sst_info)| {
sst_info
.ssts
.iter()
.map(|sst| to_ingest_sst_req(&schema_name.0, &schema_name.1, sst))
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
// send ingest requests to DB
send_ingest_requests(db_http_addr, ingest_reqs)
.await
.unwrap();
}
fn extract_name(full_table_name: &str) -> (String, String, String) {
let mut names = full_table_name.split('.').rev();
let table_name = names.next().unwrap();
let schema_name = names.next().unwrap_or("public");
let catalog_name = names.next().unwrap_or("greptime");
(
catalog_name.to_string(),
schema_name.to_string(),
table_name.to_string(),
)
}
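// Illustrative sketch, not part of the original change: how `extract_name` falls
// back to the default schema ("public") and catalog ("greptime") when the file
// name does not carry a fully qualified table name. The table names are made up.
#[cfg(test)]
mod extract_name_tests {
    use super::extract_name;
    #[test]
    fn resolves_missing_parts_with_defaults() {
        assert_eq!(
            extract_name("cpu_usage"),
            (
                "greptime".to_string(),
                "public".to_string(),
                "cpu_usage".to_string()
            )
        );
        assert_eq!(
            extract_name("my_db.cpu_usage"),
            (
                "greptime".to_string(),
                "my_db".to_string(),
                "cpu_usage".to_string()
            )
        );
        assert_eq!(
            extract_name("my_catalog.my_db.cpu_usage"),
            (
                "my_catalog".to_string(),
                "my_db".to_string(),
                "cpu_usage".to_string()
            )
        );
    }
}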
async fn send_ingest_requests(
addr: &str,
reqs: Vec<ClientIngestSstRequest>,
) -> Result<(), Box<dyn std::error::Error>> {
let client = reqwest::Client::new();
for req in reqs {
info!("ingesting sst: {req:?}");
let req = client.post(addr).json(&req);
let resp = req.send().await?;
info!("ingest response: {resp:?}");
}
Ok(())
}
#[derive(Debug, Serialize, Deserialize)]
pub(crate) struct ClientIngestSstRequest {
schema: Option<String>,
table: String,
pub(crate) file_id: String,
pub(crate) min_ts: i64,
pub(crate) max_ts: i64,
pub(crate) file_size: u64,
pub(crate) rows: u32,
pub(crate) row_groups: u32,
/// Available indexes of the file.
pub available_indexes: Vec<IndexType>,
/// Size of the index file.
pub index_file_size: u64,
pub time_unit: u32,
}
fn to_ingest_sst_req(
schema_name: &str,
table_name: &str,
sst_info: &SstInfo,
) -> ClientIngestSstRequest {
let index_file_size = sst_info.index_metadata.file_size;
let available_indexes = sst_info.index_metadata.build_available_indexes();
ClientIngestSstRequest {
schema: Some(schema_name.to_string()),
table: table_name.to_string(),
file_id: sst_info.file_id.to_string(),
min_ts: sst_info.time_range.0.value(),
max_ts: sst_info.time_range.1.value(),
file_size: sst_info.file_size,
rows: sst_info.num_rows as _,
row_groups: sst_info.num_row_groups as _,
available_indexes: available_indexes.to_vec(),
index_file_size,
time_unit: match sst_info.time_range.0.unit() {
TimeUnit::Second => 0,
TimeUnit::Millisecond => 3,
TimeUnit::Microsecond => 6,
TimeUnit::Nanosecond => 9,
},
}
}
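// Illustrative sketch, not part of the original file: the `time_unit` field above
// encodes the timestamp unit as the decimal exponent of its sub-second precision
// (0 = second, 3 = millisecond, 6 = microsecond, 9 = nanosecond). A receiver could
// map it back like this; `decode_time_unit` is a made-up name for illustration.
#[allow(dead_code)]
fn decode_time_unit(code: u32) -> Option<TimeUnit> {
    match code {
        0 => Some(TimeUnit::Second),
        3 => Some(TimeUnit::Millisecond),
        6 => Some(TimeUnit::Microsecond),
        9 => Some(TimeUnit::Nanosecond),
        _ => None,
    }
}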

View File

@@ -40,17 +40,15 @@ pub enum Error {
actual: String,
},
#[snafu(display("Failed to start log store task: {}", name))]
StartWalTask {
name: String,
#[snafu(display("Failed to start log store gc task"))]
StartGcTask {
#[snafu(implicit)]
location: Location,
source: RuntimeError,
},
#[snafu(display("Failed to stop log store task: {}", name))]
StopWalTask {
name: String,
#[snafu(display("Failed to stop log store gc task"))]
StopGcTask {
#[snafu(implicit)]
location: Location,
source: RuntimeError,

View File

@@ -35,7 +35,7 @@ use common_runtime::RepeatedTask;
use raft_engine::{Config, Engine, LogBatch, ReadableSize, RecoveryMode};
use snafu::{IntoError, ResultExt};
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartWalTaskSnafu};
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartGcTaskSnafu};
use crate::raft_engine::log_store::PurgeExpiredFilesFunction;
pub(crate) const SYSTEM_NAMESPACE: u64 = 0;
@@ -93,8 +93,7 @@ impl RaftEngineBackend {
);
gc_task
.start(common_runtime::global_runtime())
.context(StartWalTaskSnafu { name: "gc_task" })?;
.context(StartGcTaskSnafu)?;
Ok(Self {
engine: RwLock::new(engine),
_gc_task: gc_task,

View File

@@ -14,6 +14,7 @@
use std::collections::{hash_map, HashMap};
use std::fmt::{Debug, Formatter};
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::time::Duration;
@@ -31,7 +32,7 @@ use store_api::storage::RegionId;
use crate::error::{
AddEntryLogBatchSnafu, DiscontinuousLogIndexSnafu, Error, FetchEntrySnafu,
IllegalNamespaceSnafu, IllegalStateSnafu, InvalidProviderSnafu, OverrideCompactedEntrySnafu,
RaftEngineSnafu, Result, StartWalTaskSnafu, StopWalTaskSnafu,
RaftEngineSnafu, Result, StartGcTaskSnafu, StopGcTaskSnafu,
};
use crate::metrics;
use crate::raft_engine::backend::SYSTEM_NAMESPACE;
@@ -45,7 +46,7 @@ pub struct RaftEngineLogStore {
read_batch_size: usize,
engine: Arc<Engine>,
gc_task: RepeatedTask<Error>,
sync_task: RepeatedTask<Error>,
last_sync_time: AtomicI64,
}
pub struct PurgeExpiredFilesFunction {
@@ -82,31 +83,6 @@ impl TaskFunction<Error> for PurgeExpiredFilesFunction {
}
}
pub struct SyncWalTaskFunction {
engine: Arc<Engine>,
}
#[async_trait::async_trait]
impl TaskFunction<Error> for SyncWalTaskFunction {
async fn call(&mut self) -> std::result::Result<(), Error> {
let engine = self.engine.clone();
if let Err(e) = tokio::task::spawn_blocking(move || engine.sync()).await {
error!(e; "Failed to sync raft engine log files");
};
Ok(())
}
fn name(&self) -> &str {
"SyncWalTaskFunction"
}
}
impl SyncWalTaskFunction {
pub fn new(engine: Arc<Engine>) -> Self {
Self { engine }
}
}
impl RaftEngineLogStore {
pub async fn try_new(dir: String, config: &RaftEngineConfig) -> Result<Self> {
let raft_engine_config = Config {
@@ -128,18 +104,13 @@ impl RaftEngineLogStore {
}),
);
let sync_task = RepeatedTask::new(
config.sync_period.unwrap_or(Duration::from_secs(5)),
Box::new(SyncWalTaskFunction::new(engine.clone())),
);
let log_store = Self {
sync_write: config.sync_write,
sync_period: config.sync_period,
read_batch_size: config.read_batch_size,
engine,
gc_task,
sync_task,
last_sync_time: AtomicI64::new(0),
};
log_store.start()?;
Ok(log_store)
@@ -152,10 +123,7 @@ impl RaftEngineLogStore {
fn start(&self) -> Result<()> {
self.gc_task
.start(common_runtime::global_runtime())
.context(StartWalTaskSnafu { name: "gc_task" })?;
self.sync_task
.start(common_runtime::global_runtime())
.context(StartWalTaskSnafu { name: "sync_task" })
.context(StartGcTaskSnafu)
}
fn span(&self, provider: &RaftEngineProvider) -> (Option<u64>, Option<u64>) {
@@ -252,14 +220,7 @@ impl LogStore for RaftEngineLogStore {
type Error = Error;
async fn stop(&self) -> Result<()> {
self.gc_task
.stop()
.await
.context(StopWalTaskSnafu { name: "gc_task" })?;
self.sync_task
.stop()
.await
.context(StopWalTaskSnafu { name: "sync_task" })
self.gc_task.stop().await.context(StopGcTaskSnafu)
}
/// Appends a batch of entries to logstore. `RaftEngineLogStore` assures the atomicity of
@@ -279,9 +240,20 @@ impl LogStore for RaftEngineLogStore {
}
let (mut batch, last_entry_ids) = self.entries_to_batch(entries)?;
let mut sync = self.sync_write;
if let Some(sync_period) = &self.sync_period {
let now = common_time::util::current_time_millis();
if now - self.last_sync_time.load(Ordering::Relaxed) >= sync_period.as_millis() as i64 {
self.last_sync_time.store(now, Ordering::Relaxed);
sync = true;
}
}
let _ = self
.engine
.write(&mut batch, self.sync_write)
.write(&mut batch, sync)
.context(RaftEngineSnafu)?;
Ok(AppendBatchResponse { last_entry_ids })
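// Illustrative sketch, not the actual implementation: the periodic-sync decision
// above in isolation. `last_sync_ms` stands in for the `last_sync_time` atomic and
// timestamps are plain milliseconds; a write is synced either because `sync_write`
// is enabled or because at least one `sync_period` has elapsed since the last sync.
//
// fn should_sync(sync_write: bool, sync_period: Option<Duration>, now_ms: i64, last_sync_ms: i64) -> bool {
//     if sync_write {
//         return true;
//     }
//     match sync_period {
//         Some(period) => now_ms - last_sync_ms >= period.as_millis() as i64,
//         None => false,
//     }
// }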

View File

@@ -111,7 +111,6 @@ impl MetaClientBuilder {
.enable_store()
.enable_heartbeat()
.enable_procedure()
.enable_access_cluster_info()
}
pub fn enable_heartbeat(self) -> Self {

View File

@@ -7,7 +7,6 @@ license.workspace = true
[features]
mock = []
pg_kvbackend = ["dep:tokio-postgres", "common-meta/pg_kvbackend"]
mysql_kvbackend = [] # placeholder features so CI can compile
[lints]
workspace = true

View File

@@ -335,10 +335,6 @@ impl MetricEngine {
}
}
pub fn mito(&self) -> MitoEngine {
self.inner.mito.clone()
}
pub async fn logical_regions(&self, physical_region_id: RegionId) -> Result<Vec<RegionId>> {
self.inner
.metadata_region

View File

@@ -59,7 +59,7 @@ pub mod engine;
pub mod error;
mod metadata_region;
mod metrics;
mod row_modifier;
pub mod row_modifier;
#[cfg(test)]
mod test_util;
mod utils;

View File

@@ -338,7 +338,6 @@ impl MetadataRegion {
limit: None,
series_row_selector: None,
sequence: None,
distribution: None,
}
}
@@ -528,7 +527,6 @@ impl MetadataRegion {
limit: None,
series_row_selector: None,
sequence: None,
distribution: None,
};
let record_batch_stream = self
.mito

View File

@@ -40,7 +40,7 @@ const TSID_HASH_SEED: u32 = 846793005;
///
/// - For [`PrimaryKeyEncoding::Dense`] encoding,
/// it adds two columns(`__table_id`, `__tsid`) to the row.
pub struct RowModifier {
pub(crate) struct RowModifier {
codec: SparsePrimaryKeyCodec,
}
@@ -52,7 +52,7 @@ impl RowModifier {
}
/// Modify rows with the given primary key encoding.
pub fn modify_rows(
pub(crate) fn modify_rows(
&self,
iter: RowsIter,
table_id: TableId,
@@ -145,16 +145,14 @@ impl RowModifier {
/// Fills internal columns of a row with table name and a hash of tag values.
fn fill_internal_columns(&self, table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) {
let mut hasher = mur3::Hasher128::with_seed(TSID_HASH_SEED);
let mut hasher = TsidGenerator::default();
for (name, value) in iter.primary_keys_with_name() {
// The type is checked before. So only null is ignored.
if let Some(ValueData::StringValue(string)) = &value.value_data {
name.hash(&mut hasher);
string.hash(&mut hasher);
hasher.write_label(name, string);
}
}
// TSID is 64 bits, simply truncate the 128 bits hash
let (hash, _) = hasher.finish128();
let hash = hasher.finish();
(
ValueData::U32Value(table_id).into(),
@@ -163,6 +161,34 @@ impl RowModifier {
}
}
/// Tsid generator.
pub struct TsidGenerator {
hasher: mur3::Hasher128,
}
impl Default for TsidGenerator {
fn default() -> Self {
Self {
hasher: mur3::Hasher128::with_seed(TSID_HASH_SEED),
}
}
}
impl TsidGenerator {
/// Writes a label pair to the generator.
pub fn write_label(&mut self, name: &str, value: &str) {
name.hash(&mut self.hasher);
value.hash(&mut self.hasher);
}
/// Generates a new TSID.
pub fn finish(&mut self) -> u64 {
// TSID is 64 bits, simply truncate the 128 bits hash
let (hash, _) = self.hasher.finish128();
hash
}
}
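// Illustrative usage, not part of this diff: the generator is deterministic for a
// given sequence of label pairs, so the same series always maps to the same TSID.
// Callers are expected to feed labels in a stable (e.g. lexicographically sorted) order.
//
// let mut generator = TsidGenerator::default();
// generator.write_label("host", "h1");
// generator.write_label("idc", "idc-1");
// let tsid: u64 = generator.finish();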
/// Index of a value.
#[derive(Debug, Clone, Copy)]
struct ValueIndex {

View File

@@ -121,7 +121,7 @@ impl AccessLayer {
/// Writes a SST with specific `file_id` and `metadata` to the layer.
///
/// Returns the info of the SST. If no data written, returns None.
pub(crate) async fn write_sst(
pub async fn write_sst(
&self,
request: SstWriteRequest,
write_opts: &WriteOptions,
@@ -191,26 +191,26 @@ impl AccessLayer {
/// `OperationType` represents the origin of the `SstWriteRequest`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum OperationType {
pub enum OperationType {
Flush,
Compact,
}
/// Contents to build a SST.
pub(crate) struct SstWriteRequest {
pub(crate) op_type: OperationType,
pub(crate) metadata: RegionMetadataRef,
pub(crate) source: Source,
pub(crate) cache_manager: CacheManagerRef,
pub struct SstWriteRequest {
pub op_type: OperationType,
pub metadata: RegionMetadataRef,
pub source: Source,
pub cache_manager: CacheManagerRef,
#[allow(dead_code)]
pub(crate) storage: Option<String>,
pub(crate) max_sequence: Option<SequenceNumber>,
pub storage: Option<String>,
pub max_sequence: Option<SequenceNumber>,
/// Configs for index
pub(crate) index_options: IndexOptions,
pub(crate) inverted_index_config: InvertedIndexConfig,
pub(crate) fulltext_index_config: FulltextIndexConfig,
pub(crate) bloom_filter_index_config: BloomFilterConfig,
pub index_options: IndexOptions,
pub inverted_index_config: InvertedIndexConfig,
pub fulltext_index_config: FulltextIndexConfig,
pub bloom_filter_index_config: BloomFilterConfig,
}
pub(crate) async fn new_fs_cache_store(root: &str) -> Result<ObjectStore> {

View File

@@ -127,8 +127,8 @@ impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobRead
mod test {
use std::num::NonZeroUsize;
use common_base::BitVec;
use futures::stream;
use index::bitmap::{Bitmap, BitmapType};
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
use index::inverted_index::format::writer::{InvertedIndexBlobWriter, InvertedIndexWriter};
use index::Bytes;
@@ -191,44 +191,24 @@ mod test {
writer
.add_index(
"tag0".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
])),
index::bitmap::BitmapType::Roaring,
)
.await
.unwrap();
writer
.add_index(
"tag1".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("x"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("y"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("z"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
])),
index::bitmap::BitmapType::Roaring,
)
.await
.unwrap();
@@ -287,31 +267,22 @@ mod test {
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// tag1
let tag1 = metadata.metas.get("tag1").unwrap();
@@ -330,31 +301,22 @@ mod test {
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// fuzz test
let mut rng = rand::thread_rng();

View File

@@ -46,6 +46,7 @@ const INDEX_CREATE_MEM_THRESHOLD_FACTOR: u64 = 16;
pub(crate) const FETCH_OPTION_TIMEOUT: Duration = Duration::from_secs(3);
/// Configuration for [MitoEngine](crate::engine::MitoEngine).
/// Before using the config, make sure to call `MitoConfig::validate()` to check if the config is valid.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
#[serde(default)]
pub struct MitoConfig {

View File

@@ -80,7 +80,6 @@ async fn test_scan_projection() {
limit: None,
series_row_selector: None,
sequence: None,
distribution: None,
};
let stream = engine.scan_to_stream(region_id, request).await.unwrap();
let batches = RecordBatches::try_collect(stream).await.unwrap();

View File

@@ -42,6 +42,13 @@ use crate::worker::WorkerId;
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("External error, context: {}", context))]
External {
source: BoxedError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to encode sparse primary key, reason: {}", reason))]
EncodeSparsePrimaryKey {
reason: String,
@@ -1085,7 +1092,7 @@ impl ErrorExt for Error {
| PuffinPurgeStager { source, .. } => source.status_code(),
CleanDir { .. } => StatusCode::Unexpected,
InvalidConfig { .. } => StatusCode::InvalidArguments,
StaleLogEntry { .. } => StatusCode::Unexpected,
StaleLogEntry { .. } | External { .. } => StatusCode::Unexpected,
FilterRecordBatch { source, .. } => source.status_code(),

View File

@@ -23,8 +23,8 @@
#[cfg_attr(feature = "test", allow(unused))]
pub mod test_util;
mod access_layer;
mod cache;
pub mod access_layer;
pub mod cache;
pub mod compaction;
pub mod config;
pub mod engine;

View File

@@ -21,7 +21,6 @@ use common_time::Timestamp;
use parquet::arrow::arrow_reader::RowSelection;
use smallvec::{smallvec, SmallVec};
use store_api::region_engine::PartitionRange;
use store_api::storage::TimeSeriesDistribution;
use crate::cache::CacheStrategy;
use crate::error::Result;
@@ -99,8 +98,8 @@ impl RangeMeta {
Self::push_seq_file_ranges(input.memtables.len(), &input.files, &mut ranges);
let ranges = group_ranges_for_seq_scan(ranges);
if compaction || input.distribution == Some(TimeSeriesDistribution::PerSeries) {
// We don't split ranges in compaction or TimeSeriesDistribution::PerSeries.
if compaction {
// We don't split ranges in compaction.
return ranges;
}
maybe_split_ranges_for_seq_scan(ranges)

View File

@@ -31,7 +31,7 @@ use datafusion_expr::Expr;
use smallvec::SmallVec;
use store_api::metadata::RegionMetadata;
use store_api::region_engine::{PartitionRange, RegionScannerRef};
use store_api::storage::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::{ScanRequest, TimeSeriesRowSelector};
use table::predicate::{build_time_range_predicate, Predicate};
use tokio::sync::{mpsc, Semaphore};
use tokio_stream::wrappers::ReceiverStream;
@@ -287,16 +287,9 @@ impl ScanRegion {
/// Returns true if the region can use unordered scan for current request.
fn use_unordered_scan(&self) -> bool {
// We use unordered scan when:
// 1. The region is in append mode.
// 2. There is no series row selector.
// 3. The required distribution is None or TimeSeriesDistribution::TimeWindowed.
//
// If table is append only and there is no series row selector, we use unordered scan in query.
// We still use seq scan in compaction.
self.version.options.append_mode
&& self.request.series_row_selector.is_none()
&& (self.request.distribution.is_none()
|| self.request.distribution == Some(TimeSeriesDistribution::TimeWindowed))
self.version.options.append_mode && self.request.series_row_selector.is_none()
}
/// Creates a scan input.
@@ -384,8 +377,7 @@ impl ScanRegion {
.with_append_mode(self.version.options.append_mode)
.with_filter_deleted(filter_deleted)
.with_merge_mode(self.version.options.merge_mode())
.with_series_row_selector(self.request.series_row_selector)
.with_distribution(self.request.distribution);
.with_series_row_selector(self.request.series_row_selector);
Ok(input)
}
@@ -565,8 +557,6 @@ pub(crate) struct ScanInput {
pub(crate) merge_mode: MergeMode,
/// Hint to select rows from time series.
pub(crate) series_row_selector: Option<TimeSeriesRowSelector>,
/// Hint for the required distribution of the scanner.
pub(crate) distribution: Option<TimeSeriesDistribution>,
}
impl ScanInput {
@@ -591,7 +581,6 @@ impl ScanInput {
filter_deleted: true,
merge_mode: MergeMode::default(),
series_row_selector: None,
distribution: None,
}
}
@@ -704,16 +693,6 @@ impl ScanInput {
self
}
/// Sets the distribution hint.
#[must_use]
pub(crate) fn with_distribution(
mut self,
distribution: Option<TimeSeriesDistribution>,
) -> Self {
self.distribution = distribution;
self
}
/// Sets the time series row selector.
#[must_use]
pub(crate) fn with_series_row_selector(

View File

@@ -29,7 +29,7 @@ use datatypes::schema::SchemaRef;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{PartitionRange, PrepareRequest, RegionScanner, ScannerProperties};
use store_api::storage::{TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::TimeSeriesRowSelector;
use tokio::sync::Semaphore;
use crate::error::{PartitionOutOfRangeSnafu, Result};
@@ -206,16 +206,32 @@ impl SeqScan {
));
}
if self.stream_ctx.input.distribution == Some(TimeSeriesDistribution::PerSeries) {
return self.scan_partition_by_series(partition);
}
let stream_ctx = self.stream_ctx.clone();
let semaphore = self.new_semaphore();
let semaphore = if self.properties.target_partitions() > self.properties.num_partitions() {
// We can use additional tasks to read the data if we have more target partitions than actual partitions.
// This semaphore is partition level.
// We don't use a global semaphore to avoid a partition waiting for others. The final concurrency
// of tasks usually won't exceed the target partitions a lot as compaction can reduce the number of
// files in a part range.
Some(Arc::new(Semaphore::new(
self.properties.target_partitions() - self.properties.num_partitions() + 1,
)))
} else {
None
};
let partition_ranges = self.properties.partitions[partition].clone();
let compaction = self.compaction;
let distinguish_range = self.properties.distinguish_partition_range;
let part_metrics = self.new_partition_metrics(partition);
let part_metrics = PartitionMetrics::new(
self.stream_ctx.input.mapper.metadata().region_id,
partition,
get_scanner_type(self.compaction),
stream_ctx.query_start,
ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
},
);
let stream = try_stream! {
part_metrics.on_first_poll();
@@ -305,124 +321,6 @@ impl SeqScan {
Ok(stream)
}
/// Scans all ranges in the given partition and merges them by time series.
/// Otherwise the returned stream might not contain any data.
fn scan_partition_by_series(
&self,
partition: usize,
) -> Result<SendableRecordBatchStream, BoxedError> {
let stream_ctx = self.stream_ctx.clone();
let semaphore = self.new_semaphore();
let partition_ranges = self.properties.partitions[partition].clone();
let distinguish_range = self.properties.distinguish_partition_range;
let part_metrics = self.new_partition_metrics(partition);
debug_assert!(!self.compaction);
let stream = try_stream! {
part_metrics.on_first_poll();
let range_builder_list = Arc::new(RangeBuilderList::new(
stream_ctx.input.num_memtables(),
stream_ctx.input.num_files(),
));
// Scans all parts.
let mut sources = Vec::with_capacity(partition_ranges.len());
for part_range in partition_ranges {
build_sources(
&stream_ctx,
&part_range,
false,
&part_metrics,
range_builder_list.clone(),
&mut sources,
);
}
// Builds a reader that merge sources from all parts.
let mut reader =
Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone())
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let cache = &stream_ctx.input.cache_strategy;
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
while let Some(batch) = reader
.next_batch()
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
{
metrics.scan_cost += fetch_start.elapsed();
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
debug_assert!(!batch.is_empty());
if batch.is_empty() {
continue;
}
let convert_start = Instant::now();
let record_batch = stream_ctx.input.mapper.convert(&batch, cache)?;
metrics.convert_cost += convert_start.elapsed();
let yield_start = Instant::now();
yield record_batch;
metrics.yield_cost += yield_start.elapsed();
fetch_start = Instant::now();
}
// Yields an empty part to indicate this range is terminated.
// The query engine can use this to optimize some queries.
if distinguish_range {
let yield_start = Instant::now();
yield stream_ctx.input.mapper.empty_record_batch();
metrics.yield_cost += yield_start.elapsed();
}
metrics.scan_cost += fetch_start.elapsed();
part_metrics.merge_metrics(&metrics);
part_metrics.on_finish();
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.stream_ctx.input.mapper.output_schema(),
Box::pin(stream),
));
Ok(stream)
}
fn new_semaphore(&self) -> Option<Arc<Semaphore>> {
if self.properties.target_partitions() > self.properties.num_partitions() {
// We can use additional tasks to read the data if we have more target partitions than actual partitions.
// This semaphore is partition level.
// We don't use a global semaphore to avoid a partition waiting for others. The final concurrency
// of tasks usually won't exceed the target partitions a lot as compaction can reduce the number of
// files in a part range.
Some(Arc::new(Semaphore::new(
self.properties.target_partitions() - self.properties.num_partitions() + 1,
)))
} else {
None
}
}
fn new_partition_metrics(&self, partition: usize) -> PartitionMetrics {
PartitionMetrics::new(
self.stream_ctx.input.mapper.metadata().region_id,
partition,
get_scanner_type(self.compaction),
self.stream_ctx.query_start,
ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
},
)
}
}
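// Worked example for the semaphore sizing above (illustrative, not part of the change):
// with target_partitions = 8 and num_partitions = 3, each partition gets a semaphore of
// 8 - 3 + 1 = 6 permits; when the target does not exceed the actual partition count, no
// semaphore is created and no additional read tasks are spawned for that partition.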
impl RegionScanner for SeqScan {
@@ -472,7 +370,7 @@ impl fmt::Debug for SeqScan {
}
}
/// Builds sources for the partition range and pushes them to the `sources` vector.
/// Builds sources for the partition range.
fn build_sources(
stream_ctx: &Arc<StreamContext>,
part_range: &PartitionRange,
@@ -484,8 +382,8 @@ fn build_sources(
// Gets range meta.
let range_meta = &stream_ctx.ranges[part_range.identifier];
#[cfg(debug_assertions)]
if compaction || stream_ctx.input.distribution == Some(TimeSeriesDistribution::PerSeries) {
// Compaction or per-series distribution expects that input sources have not been split.
if compaction {
// Compaction expects that input sources have not been split.
debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len());
for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() {
// It should scan all row groups.

View File

@@ -14,7 +14,7 @@
//! Mito region.
pub(crate) mod opener;
pub mod opener;
pub mod options;
pub(crate) mod version;

View File

@@ -15,7 +15,7 @@
//! Region opener.
use std::collections::HashMap;
use std::sync::atomic::AtomicI64;
use std::sync::atomic::{AtomicI64, AtomicU64};
use std::sync::Arc;
use common_telemetry::{debug, error, info, warn};
@@ -27,7 +27,9 @@ use object_store::util::{join_dir, normalize_dir};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::logstore::provider::Provider;
use store_api::logstore::LogStore;
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::region_engine::RegionRole;
use store_api::storage::{ColumnId, RegionId};
@@ -38,6 +40,7 @@ use crate::error::{
EmptyRegionDirSnafu, InvalidMetadataSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu,
Result, StaleLogEntrySnafu,
};
use crate::manifest::action::RegionManifest;
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
use crate::manifest::storage::manifest_compress_type;
use crate::memtable::time_partition::TimePartitions;
@@ -203,11 +206,16 @@ impl RegionOpener {
}
// Safety: must be set before calling this method.
let options = self.options.take().unwrap();
let object_store = self.object_store(&options.storage)?.clone();
let object_store = get_object_store(&options.storage, &self.object_store_manager)?.clone();
let provider = self.provider(&options.wal_options);
let metadata = Arc::new(metadata);
// Create a manifest manager for this region and writes regions to the manifest file.
let region_manifest_options = self.manifest_options(config, &options)?;
let region_manifest_options = Self::manifest_options(
config,
&options,
&self.region_dir,
&self.object_store_manager,
)?;
let manifest_manager = RegionManifestManager::new(
metadata.clone(),
region_manifest_options,
@@ -312,7 +320,12 @@ impl RegionOpener {
) -> Result<Option<MitoRegion>> {
let region_options = self.options.as_ref().unwrap().clone();
let region_manifest_options = self.manifest_options(config, &region_options)?;
let region_manifest_options = Self::manifest_options(
config,
&region_options,
&self.region_dir,
&self.object_store_manager,
)?;
let Some(manifest_manager) = RegionManifestManager::open(
region_manifest_options,
self.stats.total_manifest_size.clone(),
@@ -332,7 +345,7 @@ impl RegionOpener {
.take()
.unwrap_or_else(|| wal.wal_entry_reader(&provider, region_id, None));
let on_region_opened = wal.on_region_opened();
let object_store = self.object_store(&region_options.storage)?.clone();
let object_store = get_object_store(&region_options.storage, &self.object_store_manager)?;
debug!("Open region {} with options: {:?}", region_id, self.options);
@@ -422,13 +435,14 @@ impl RegionOpener {
/// Returns a new manifest options.
fn manifest_options(
&self,
config: &MitoConfig,
options: &RegionOptions,
region_dir: &str,
object_store_manager: &ObjectStoreManagerRef,
) -> Result<RegionManifestOptions> {
let object_store = self.object_store(&options.storage)?.clone();
let object_store = get_object_store(&options.storage, object_store_manager)?;
Ok(RegionManifestOptions {
manifest_dir: new_manifest_dir(&self.region_dir),
manifest_dir: new_manifest_dir(region_dir),
object_store,
// We don't allow users to set the compression algorithm as we use it as a file suffix.
// Currently, the manifest storage doesn't have good support for changing compression algorithms.
@@ -436,20 +450,72 @@ impl RegionOpener {
checkpoint_distance: config.manifest_checkpoint_distance,
})
}
}
/// Returns an object store corresponding to `name`. If `name` is `None`, this method returns the default object store.
fn object_store(&self, name: &Option<String>) -> Result<&object_store::ObjectStore> {
if let Some(name) = name {
Ok(self
.object_store_manager
.find(name)
.context(ObjectStoreNotFoundSnafu {
object_store: name.to_string(),
})?)
} else {
Ok(self.object_store_manager.default_object_store())
/// Returns an object store corresponding to `name`. If `name` is `None`, this method returns the default object store.
pub fn get_object_store(
name: &Option<String>,
object_store_manager: &ObjectStoreManagerRef,
) -> Result<object_store::ObjectStore> {
if let Some(name) = name {
Ok(object_store_manager
.find(name)
.context(ObjectStoreNotFoundSnafu {
object_store: name.to_string(),
})?
.clone())
} else {
Ok(object_store_manager.default_object_store().clone())
}
}
/// A loader for loading metadata from a region dir.
pub struct RegionMetadataLoader {
config: Arc<MitoConfig>,
object_store_manager: ObjectStoreManagerRef,
}
impl RegionMetadataLoader {
/// Creates a new `RegionMetadataLoader`.
pub fn new(config: Arc<MitoConfig>, object_store_manager: ObjectStoreManagerRef) -> Self {
Self {
config,
object_store_manager,
}
}
/// Loads the metadata of the region from the region dir.
pub async fn load(
&self,
region_dir: &str,
region_options: &RegionOptions,
) -> Result<Option<RegionMetadataRef>> {
let manifest = self.load_manifest(region_dir, region_options).await?;
Ok(manifest.map(|m| m.metadata.clone()))
}
/// Loads the manifest of the region from the region dir.
pub async fn load_manifest(
&self,
region_dir: &str,
region_options: &RegionOptions,
) -> Result<Option<Arc<RegionManifest>>> {
let region_manifest_options = RegionOpener::manifest_options(
&self.config,
region_options,
region_dir,
&self.object_store_manager,
)?;
let Some(manifest_manager) =
RegionManifestManager::open(region_manifest_options, Arc::new(AtomicU64::new(0)))
.await?
else {
return Ok(None);
};
let manifest = manifest_manager.manifest();
Ok(Some(manifest))
}
}
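// Illustrative usage sketch, not part of the change: loading region metadata straight
// from a region directory. `config`, `object_store_manager` and `region_dir` are assumed
// to be provided by the caller, and `RegionOptions::default()` is used here only for
// illustration.
//
// let loader = RegionMetadataLoader::new(config.clone(), object_store_manager.clone());
// if let Some(metadata) = loader.load(region_dir, &RegionOptions::default()).await? {
//     // `metadata` is a `RegionMetadataRef` describing the region's columns and schema.
// }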
/// Checks whether the recovered region has the same schema as region to create.

View File

@@ -33,6 +33,8 @@ use crate::row_converter::dense::SortField;
use crate::row_converter::{CompositeValues, PrimaryKeyCodec, PrimaryKeyFilter};
/// A codec for sparse key of metrics.
/// It requires that the input primary key columns are sorted by column name in lexicographical order.
/// It encodes the column id of the physical region.
#[derive(Clone, Debug)]
pub struct SparsePrimaryKeyCodec {
inner: Arc<SparsePrimaryKeyCodecInner>,

View File

@@ -16,9 +16,9 @@ pub(crate) mod bloom_filter;
mod codec;
pub(crate) mod fulltext_index;
mod indexer;
pub(crate) mod intermediate;
pub mod intermediate;
pub(crate) mod inverted_index;
pub(crate) mod puffin_manager;
pub mod puffin_manager;
mod statistics;
pub(crate) mod store;

View File

@@ -49,6 +49,11 @@ impl IntermediateManager {
/// Create a new `IntermediateManager` with the given root path.
/// It will clean up all garbage intermediate files from previous runs.
pub async fn init_fs(aux_path: impl AsRef<str>) -> Result<Self> {
common_telemetry::info!(
"Initializing intermediate manager, aux_path: {}",
aux_path.as_ref()
);
let store = new_fs_cache_store(&normalize_dir(aux_path.as_ref())).await?;
let store = InstrumentedStore::new(store);

View File

@@ -228,8 +228,8 @@ impl Drop for InvertedIndexApplier {
#[cfg(test)]
mod tests {
use common_base::BitVec;
use futures::io::Cursor;
use index::bitmap::Bitmap;
use index::inverted_index::search::index_apply::MockIndexApplier;
use object_store::services::Memory;
use puffin::puffin_manager::PuffinWriter;
@@ -259,7 +259,7 @@ mod tests {
mock_index_applier.expect_memory_usage().returning(|| 100);
mock_index_applier.expect_apply().returning(|_, _| {
Ok(ApplyOutput {
matched_segment_ids: Bitmap::new_bitvec(),
matched_segment_ids: BitVec::EMPTY,
total_row_count: 100,
segment_row_count: 10,
})
@@ -276,7 +276,7 @@ mod tests {
assert_eq!(
output,
ApplyOutput {
matched_segment_ids: Bitmap::new_bitvec(),
matched_segment_ids: BitVec::EMPTY,
total_row_count: 100,
segment_row_count: 10,
}

View File

@@ -277,9 +277,7 @@ impl InvertedIndexer {
let mut index_writer = InvertedIndexBlobWriter::new(tx.compat_write());
let (index_finish, puffin_add_blob) = futures::join!(
// TODO(zhongzc): config bitmap type
self.index_creator
.finish(&mut index_writer, index::bitmap::BitmapType::Roaring),
self.index_creator.finish(&mut index_writer),
puffin_writer.put_blob(INDEX_BLOB_TYPE, rx.compat(), PutOptions::default())
);

View File

@@ -61,6 +61,7 @@ impl Default for WriteOptions {
}
/// Parquet SST info returned by the writer.
#[derive(Debug)]
pub struct SstInfo {
/// SST file id.
pub file_id: FileId,

View File

@@ -583,8 +583,6 @@ type RequestBuffer = Vec<WorkerRequest>;
#[derive(Default)]
pub(crate) struct StalledRequests {
/// Stalled requests.
/// Remember to use `StalledRequests::stalled_count()` to get the total number of stalled requests
/// instead of `StalledRequests::requests.len()`.
///
/// Key: RegionId
/// Value: (estimated size, stalled requests)
@@ -619,11 +617,6 @@ impl StalledRequests {
vec![]
}
}
/// Returns the total number of all stalled requests.
pub(crate) fn stalled_count(&self) -> usize {
self.requests.values().map(|reqs| reqs.1.len()).sum()
}
}
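// Illustrative: with two regions holding 3 and 5 stalled requests respectively,
// `stalled_count()` returns 8, while `requests.len()` would only return 2
// (one entry per region), which is what the doc comment on `requests` points out.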
/// Background worker loop to handle requests.

View File

@@ -329,15 +329,6 @@ async fn edit_region(
let index_key = IndexKey::new(region_id, file_meta.file_id, FileType::Parquet);
let remote_path = location::sst_file_path(layer.region_dir(), file_meta.file_id);
let is_index_exist = file_meta.exists_index();
let index_file_size = file_meta.index_file_size();
let index_file_index_key =
IndexKey::new(region_id, file_meta.file_id, FileType::Puffin);
let index_remote_path =
location::index_file_path(layer.region_dir(), file_meta.file_id);
let file_size = file_meta.file_size;
common_runtime::spawn_global(async move {
if write_cache
@@ -354,22 +345,6 @@ async fn edit_region(
listener.on_file_cache_filled(index_key.file_id);
}
if is_index_exist {
// also download puffin file
if let Err(err) = write_cache
.download(
index_file_index_key,
&index_remote_path,
layer.object_store(),
index_file_size,
)
.await
{
common_telemetry::error!(
err; "Failed to download puffin file, region_id: {}, index_file_index_key: {:?}, index_remote_path: {}", region_id, index_file_index_key, index_remote_path
);
}
}
});
}
}

View File

@@ -147,7 +147,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
pub(crate) async fn handle_stalled_requests(&mut self) {
// Handle stalled requests.
let stalled = std::mem::take(&mut self.stalled_requests);
self.stalled_count.sub(stalled.stalled_count() as i64);
self.stalled_count.sub(stalled.requests.len() as i64);
// We already stalled these requests, don't stall them again.
for (_, (_, mut requests)) in stalled.requests {
self.handle_write_requests(&mut requests, false).await;
@@ -157,7 +157,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
/// Rejects all stalled requests.
pub(crate) fn reject_stalled_requests(&mut self) {
let stalled = std::mem::take(&mut self.stalled_requests);
self.stalled_count.sub(stalled.stalled_count() as i64);
self.stalled_count.sub(stalled.requests.len() as i64);
for (_, (_, mut requests)) in stalled.requests {
reject_write_requests(&mut requests);
}

View File

@@ -74,7 +74,6 @@ pub struct Inserter {
catalog_manager: CatalogManagerRef,
partition_manager: PartitionRuleManagerRef,
node_manager: NodeManagerRef,
#[allow(unused)]
table_flownode_set_cache: TableFlownodeSetCacheRef,
}
@@ -363,8 +362,6 @@ impl Inserter {
instant_requests,
} = requests;
// TODO(discord9): mirror some
// Mirror requests for source table to flownode asynchronously
let flow_mirror_task = FlowMirrorTask::new(
&self.table_flownode_set_cache,
@@ -898,14 +895,12 @@ struct CreateAlterTableResult {
table_infos: HashMap<TableId, Arc<TableInfo>>,
}
#[allow(unused)]
struct FlowMirrorTask {
requests: HashMap<Peer, RegionInsertRequests>,
num_rows: usize,
}
impl FlowMirrorTask {
#[allow(unused)]
async fn new(
cache: &TableFlownodeSetCacheRef,
requests: impl Iterator<Item = &RegionInsertRequest>,
@@ -979,7 +974,6 @@ impl FlowMirrorTask {
})
}
#[allow(unused)]
fn detach(self, node_manager: NodeManagerRef) -> Result<()> {
crate::metrics::DIST_MIRROR_PENDING_ROW_COUNT.add(self.num_rows as i64);
for (peer, inserts) in self.requests {

View File

@@ -41,7 +41,7 @@ futures.workspace = true
greptime-proto.workspace = true
itertools.workspace = true
jsonb.workspace = true
jsonpath-rust = "0.7.5"
jsonpath-rust = "0.7.3"
lazy_static.workspace = true
moka = { workspace = true, features = ["sync"] }
once_cell.workspace = true

View File

@@ -16,13 +16,10 @@ pub mod array;
pub mod map;
pub mod time;
use std::result::Result as StdResult;
pub use array::Array;
use jsonb::{Number as JsonbNumber, Object as JsonbObject, Value as JsonbValue};
use jsonpath_rust::parser::{parse_json_path, JsonPathIndex};
use jsonpath_rust::path::{JsonLike, Path};
use jsonpath_rust::{jsp_idx, jsp_obj, JsonPath, JsonPathParserError, JsonPathStr};
use jsonpath_rust::{jsp_idx, jsp_obj};
pub use map::Map;
use regex::Regex;
use snafu::{OptionExt, ResultExt};
@@ -289,52 +286,6 @@ impl Value {
_ => None,
}
}
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L779
pub fn pointer(&self, pointer: &str) -> Option<&Value> {
if pointer.is_empty() {
return Some(self);
}
if !pointer.starts_with('/') {
return None;
}
pointer
.split('/')
.skip(1)
.map(|x| x.replace("~1", "/").replace("~0", "~"))
.try_fold(self, |target, token| match target {
Value::Map(map) => map.get(&token),
Value::Array(list) => parse_index(&token).and_then(|x| list.get(x)),
_ => None,
})
}
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L834
pub fn pointer_mut(&mut self, pointer: &str) -> Option<&mut Value> {
if pointer.is_empty() {
return Some(self);
}
if !pointer.starts_with('/') {
return None;
}
pointer
.split('/')
.skip(1)
.map(|x| x.replace("~1", "/").replace("~0", "~"))
.try_fold(self, |target, token| match target {
Value::Map(map) => map.get_mut(&token),
Value::Array(list) => parse_index(&token).and_then(move |x| list.get_mut(x)),
_ => None,
})
}
}
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L259
fn parse_index(s: &str) -> Option<usize> {
if s.starts_with('+') || (s.starts_with('0') && s.len() != 1) {
return None;
}
s.parse().ok()
}
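// Illustrative notes, not part of the original code: `pointer` and `pointer_mut`
// follow RFC 6901 escaping as implemented above, so "~1" decodes to "/" and "~0"
// to "~", while `parse_index` rejects indices with a leading '+' or leading zeros.
// For a map {"a/b": ["x", "y"]}:
//   pointer("/a~1b/1")  resolves to the element "y",
//   pointer("/a~1b/01") resolves to None,
//   pointer("")         resolves to the value itself.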
impl std::fmt::Display for Value {
@@ -863,46 +814,4 @@ impl JsonLike for Value {
fn null() -> Self {
Value::Null
}
// ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L423
fn reference<T>(
&self,
path: T,
) -> std::result::Result<std::option::Option<&Value>, JsonPathParserError>
where
T: Into<JsonPathStr>,
{
Ok(self.pointer(&path_to_json_path(path.into())?))
}
// https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L430
fn reference_mut<T>(
&mut self,
path: T,
) -> std::result::Result<std::option::Option<&mut Value>, JsonPathParserError>
where
T: Into<JsonPathStr>,
{
Ok(self.pointer_mut(&path_to_json_path(path.into())?))
}
}
// ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L438
fn path_to_json_path(path: JsonPathStr) -> StdResult<String, JsonPathParserError> {
convert_part(&parse_json_path(path.as_str())?)
}
// https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L442
fn convert_part(path: &JsonPath) -> StdResult<String, JsonPathParserError> {
match path {
JsonPath::Chain(elems) => elems
.iter()
.map(convert_part)
.collect::<StdResult<String, JsonPathParserError>>(),
JsonPath::Index(JsonPathIndex::Single(v)) => Ok(format!("/{}", v)),
JsonPath::Field(e) => Ok(format!("/{}", e)),
JsonPath::Root => Ok("".to_string()),
e => Err(JsonPathParserError::InvalidJsonPath(e.to_string())),
}
}

View File

@@ -16,7 +16,6 @@ common-macro.workspace = true
common-recordbatch.workspace = true
common-telemetry.workspace = true
datafusion.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
datatypes.workspace = true
futures.workspace = true

View File

@@ -20,7 +20,6 @@ mod holt_winters;
mod idelta;
mod predict_linear;
mod quantile;
mod quantile_aggr;
mod resets;
mod round;
#[cfg(test)]
@@ -40,7 +39,6 @@ pub use holt_winters::HoltWinters;
pub use idelta::IDelta;
pub use predict_linear::PredictLinear;
pub use quantile::QuantileOverTime;
pub use quantile_aggr::quantile_udaf;
pub use resets::Resets;
pub use round::Round;

View File

@@ -125,7 +125,7 @@ impl QuantileOverTime {
}
/// Refer to <https://github.com/prometheus/prometheus/blob/6e2905a4d4ff9b47b1f6d201333f5bd53633f921/promql/quantile.go#L357-L386>
pub(crate) fn quantile_impl(values: &[f64], quantile: f64) -> Option<f64> {
fn quantile_impl(values: &[f64], quantile: f64) -> Option<f64> {
if quantile.is_nan() || values.is_empty() {
return Some(f64::NAN);
}
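// Worked example (illustrative), matching the quantile_aggr tests shown below:
// quantile_impl(&[1.0, 2.0, 3.0, 4.0, 5.0], 0.5) yields Some(3.0) and the 0.25
// quantile yields Some(2.0), following the Prometheus interpolation linked above.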

View File

@@ -1,297 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use datafusion::arrow::array::{ArrayRef, AsArray};
use datafusion::common::cast::{as_list_array, as_primitive_array, as_struct_array};
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF, Volatility};
use datafusion::prelude::create_udaf;
use datafusion_common::ScalarValue;
use datatypes::arrow::array::{ListArray, StructArray};
use datatypes::arrow::datatypes::{DataType, Field, Float64Type};
use crate::functions::quantile::quantile_impl;
const QUANTILE_NAME: &str = "quantile";
const VALUES_FIELD_NAME: &str = "values";
const DEFAULT_LIST_FIELD_NAME: &str = "item";
#[derive(Debug, Default)]
pub struct QuantileAccumulator {
q: f64,
values: Vec<Option<f64>>,
}
/// Create a quantile `AggregateUDF` for PromQL quantile operator,
/// which calculates φ-quantile (0 ≤ φ ≤ 1) over dimensions
pub fn quantile_udaf(q: f64) -> Arc<AggregateUDF> {
Arc::new(create_udaf(
QUANTILE_NAME,
// Input type: (values)
vec![DataType::Float64],
// Output type: the φ-quantile
Arc::new(DataType::Float64),
Volatility::Immutable,
// Create the accumulator
Arc::new(move |_| Ok(Box::new(QuantileAccumulator::new(q)))),
// Intermediate state types
Arc::new(vec![DataType::Struct(
vec![Field::new(
VALUES_FIELD_NAME,
DataType::List(Arc::new(Field::new(
DEFAULT_LIST_FIELD_NAME,
DataType::Float64,
true,
))),
false,
)]
.into(),
)]),
))
}
impl QuantileAccumulator {
pub fn new(q: f64) -> Self {
Self {
q,
..Default::default()
}
}
}
impl DfAccumulator for QuantileAccumulator {
fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
let f64_array = values[0].as_primitive::<Float64Type>();
self.values.extend(f64_array);
Ok(())
}
fn evaluate(&mut self) -> DfResult<ScalarValue> {
let values: Vec<_> = self.values.iter().map(|v| v.unwrap_or(0.0)).collect();
let result = quantile_impl(&values, self.q);
ScalarValue::new_primitive::<Float64Type>(result, &DataType::Float64)
}
fn size(&self) -> usize {
std::mem::size_of::<Self>() + self.values.capacity() * std::mem::size_of::<Option<f64>>()
}
fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
let values_array = Arc::new(ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
Some(self.values.clone()),
]));
let state_struct = StructArray::new(
vec![Field::new(
VALUES_FIELD_NAME,
DataType::List(Arc::new(Field::new(
DEFAULT_LIST_FIELD_NAME,
DataType::Float64,
true,
))),
false,
)]
.into(),
vec![values_array],
None,
);
Ok(vec![ScalarValue::Struct(Arc::new(state_struct))])
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
if states.is_empty() {
return Ok(());
}
for state in states {
let state = as_struct_array(state)?;
for list in as_list_array(state.column(0))?.iter().flatten() {
let f64_array = as_primitive_array::<Float64Type>(&list)?.clone();
self.values.extend(&f64_array);
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion::arrow::array::{ArrayRef, Float64Array};
use datafusion_common::ScalarValue;
use super::*;
fn create_f64_array(values: Vec<Option<f64>>) -> ArrayRef {
Arc::new(Float64Array::from(values)) as ArrayRef
}
#[test]
fn test_quantile_accumulator_empty() {
let mut accumulator = QuantileAccumulator::new(0.5);
let result = accumulator.evaluate().unwrap();
match result {
ScalarValue::Float64(_) => (),
_ => panic!("Expected Float64 scalar value"),
}
}
#[test]
fn test_quantile_accumulator_single_value() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(10.0)]);
accumulator.update_batch(&[input]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(10.0)));
}
#[test]
fn test_quantile_accumulator_multiple_values() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]);
accumulator.update_batch(&[input]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
}
#[test]
fn test_quantile_accumulator_with_nulls() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(1.0), None, Some(3.0), Some(4.0), Some(5.0)]);
accumulator.update_batch(&[input]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
}
#[test]
fn test_quantile_accumulator_multiple_batches() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input1 = create_f64_array(vec![Some(1.0), Some(2.0)]);
let input2 = create_f64_array(vec![Some(3.0), Some(4.0), Some(5.0)]);
accumulator.update_batch(&[input1]).unwrap();
accumulator.update_batch(&[input2]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
}
#[test]
fn test_quantile_accumulator_different_quantiles() {
let mut min_accumulator = QuantileAccumulator::new(0.0);
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]);
min_accumulator.update_batch(&[input.clone()]).unwrap();
assert_eq!(
min_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(1.0))
);
let mut q1_accumulator = QuantileAccumulator::new(0.25);
q1_accumulator.update_batch(&[input.clone()]).unwrap();
assert_eq!(
q1_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(2.0))
);
let mut q3_accumulator = QuantileAccumulator::new(0.75);
q3_accumulator.update_batch(&[input.clone()]).unwrap();
assert_eq!(
q3_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(4.0))
);
let mut max_accumulator = QuantileAccumulator::new(1.0);
max_accumulator.update_batch(&[input]).unwrap();
assert_eq!(
max_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(5.0))
);
}
#[test]
fn test_quantile_accumulator_size() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0)]);
let initial_size = accumulator.size();
accumulator.update_batch(&[input]).unwrap();
let after_update_size = accumulator.size();
assert!(after_update_size >= initial_size);
}
#[test]
fn test_quantile_accumulator_state_and_merge() -> DfResult<()> {
let mut acc1 = QuantileAccumulator::new(0.5);
let input1 = create_f64_array(vec![Some(1.0), Some(2.0)]);
acc1.update_batch(&[input1])?;
let state1 = acc1.state()?;
let mut acc2 = QuantileAccumulator::new(0.5);
let input2 = create_f64_array(vec![Some(3.0), Some(4.0), Some(5.0)]);
acc2.update_batch(&[input2])?;
let mut struct_builders = vec![];
for scalar in &state1 {
if let ScalarValue::Struct(struct_array) = scalar {
struct_builders.push(struct_array.clone() as ArrayRef);
}
}
acc2.merge_batch(&struct_builders)?;
let result = acc2.evaluate()?;
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
Ok(())
}
#[test]
fn test_quantile_accumulator_with_extreme_values() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(f64::MAX), Some(f64::MIN), Some(0.0)]);
accumulator.update_batch(&[input]).unwrap();
let _result = accumulator.evaluate().unwrap();
}
#[test]
fn test_quantile_udaf_creation() {
let q = 0.5;
let udaf = quantile_udaf(q);
assert_eq!(udaf.name(), QUANTILE_NAME);
assert_eq!(udaf.return_type(&[]).unwrap(), DataType::Float64);
}
}
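A minimal, hypothetical sketch of driving the quantile UDAF above directly through DataFusion's DataFrame API, outside the PromQL planner: `quantile_udaf` is the function defined in this file, while the table name `t`, the column `v`, the in-memory batch, and the tokio runtime are illustrative assumptions, and exact signatures may differ across DataFusion versions. The median of 1.0..5.0 comes out as 3.0, matching `test_quantile_accumulator_multiple_values`.
use std::sync::Arc;
use datafusion::arrow::array::{ArrayRef, Float64Array};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::prelude::{col, SessionContext};
// Assumes `quantile_udaf` from the module above is in scope.
#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // One Float64 value column, mirroring the accumulator tests above.
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Float64, true)]));
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0, 5.0])) as ArrayRef],
    )?;
    let ctx = SessionContext::new();
    ctx.register_batch("t", batch)?;
    // Build the φ = 0.5 aggregate and apply it with no GROUP BY keys.
    let quantile = quantile_udaf(0.5);
    let df = ctx
        .table("t")
        .await?
        .aggregate(vec![], vec![quantile.call(vec![col("v")])])?;
    df.show().await?;
    Ok(())
}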

View File

@@ -13,7 +13,6 @@
// limitations under the License.
use datafusion::dataframe::DataFrame as DfDataFrame;
use datafusion_expr::LogicalPlan;
/// DataFrame represents a logical set of rows with the same named columns.
/// Similar to a Pandas DataFrame or Spark DataFrame
@@ -21,11 +20,3 @@ use datafusion_expr::LogicalPlan;
pub enum DataFrame {
DataFusion(DfDataFrame),
}
impl DataFrame {
pub fn into_logical_plan(self) -> LogicalPlan {
match self {
Self::DataFusion(dataframe) => dataframe.into_parts().1,
}
}
}

View File

@@ -31,7 +31,7 @@ use datatypes::arrow::datatypes::SchemaRef;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::RegionEngineRef;
use store_api::storage::{RegionId, ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::{RegionId, ScanRequest, TimeSeriesRowSelector};
use table::table::scan::RegionScanExec;
use crate::error::{GetRegionMetadataSnafu, Result};
@@ -175,10 +175,10 @@ impl TableProvider for DummyTableProvider {
let scanner = self
.engine
.handle_query(self.region_id, request.clone())
.handle_query(self.region_id, request)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
Ok(Arc::new(RegionScanExec::new(scanner, request)?))
Ok(Arc::new(RegionScanExec::new(scanner)))
}
fn supports_filters_pushdown(
@@ -233,11 +233,6 @@ impl DummyTableProvider {
self.scan_request.lock().unwrap().output_ordering = Some(order_opts.to_vec());
}
/// Sets the distribution hint of the query to the provider.
pub fn with_distribution(&self, distribution: TimeSeriesDistribution) {
self.scan_request.lock().unwrap().distribution = Some(distribution);
}
/// Sets the time series selector hint of the query to the provider.
pub fn with_time_series_selector_hint(&self, selector: TimeSeriesRowSelector) {
self.scan_request.lock().unwrap().series_row_selector = Some(selector);

View File

@@ -23,7 +23,6 @@ use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::{DataFusionError, Result};
use store_api::region_engine::PartitionRange;
use store_api::storage::TimeSeriesDistribution;
use table::table::scan::RegionScanExec;
#[derive(Debug)]
@@ -66,14 +65,6 @@ impl ParallelizeScan {
return Ok(Transformed::no(plan));
}
// don't parallelize if we want per series distribution
if matches!(
region_scan_exec.distribution(),
Some(TimeSeriesDistribution::PerSeries)
) {
return Ok(Transformed::no(plan));
}
let ranges = region_scan_exec.get_partition_ranges();
let total_range_num = ranges.len();
let expected_partition_num = config.execution.target_partitions;

View File

@@ -23,7 +23,7 @@ use datafusion_common::{Column, Result};
use datafusion_expr::expr::Sort;
use datafusion_expr::{utils, Expr, LogicalPlan};
use datafusion_optimizer::{OptimizerConfig, OptimizerRule};
use store_api::storage::{TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::TimeSeriesRowSelector;
use crate::dummy_catalog::DummyTableProvider;
@@ -121,36 +121,6 @@ impl ScanHintRule {
});
}
adapter.with_ordering_hint(&opts);
let mut sort_expr_cursor = order_expr.iter().filter_map(|s| s.expr.try_as_col());
let region_metadata = adapter.region_metadata();
// ignore table without pk
if region_metadata.primary_key.is_empty() {
return;
}
let mut pk_column_iter = region_metadata.primary_key_columns();
let mut curr_sort_expr = sort_expr_cursor.next();
let mut curr_pk_col = pk_column_iter.next();
while let (Some(sort_expr), Some(pk_col)) = (curr_sort_expr, curr_pk_col) {
if sort_expr.name == pk_col.column_schema.name {
curr_sort_expr = sort_expr_cursor.next();
curr_pk_col = pk_column_iter.next();
} else {
return;
}
}
let next_remaining = sort_expr_cursor.next();
match (curr_sort_expr, next_remaining) {
(Some(expr), None)
if expr.name == region_metadata.time_index_column().column_schema.name =>
{
adapter.with_distribution(TimeSeriesDistribution::PerSeries);
}
(None, _) => adapter.with_distribution(TimeSeriesDistribution::PerSeries),
(Some(_), _) => {}
}
}
fn set_time_series_row_selector_hint(

View File

@@ -188,7 +188,7 @@ impl QueryLanguageParser {
Ok(QueryStatement::Promql(eval_stmt))
}
pub fn parse_promql_timestamp(timestamp: &str) -> Result<SystemTime> {
fn parse_promql_timestamp(timestamp: &str) -> Result<SystemTime> {
// try rfc3339 format
let rfc3339_result = DateTime::parse_from_rfc3339(timestamp)
.context(ParseTimestampSnafu { raw: timestamp })
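Only the RFC 3339 branch of `parse_promql_timestamp` is visible in the hunk above. Below is a minimal, self-contained sketch of the parsing order the comment suggests; the fallback to Unix seconds is an assumption based on what the Prometheus HTTP API accepts rather than something this diff shows, and the helper name is hypothetical.
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use chrono::DateTime;
// Hypothetical helper: try RFC 3339 first, then fall back to a Unix timestamp
// in (possibly fractional) seconds.
fn parse_promql_timestamp_sketch(timestamp: &str) -> Option<SystemTime> {
    // RFC 3339, e.g. "2015-07-01T20:10:51.781Z".
    if let Ok(dt) = DateTime::parse_from_rfc3339(timestamp) {
        let millis = u64::try_from(dt.timestamp_millis()).ok()?;
        return UNIX_EPOCH.checked_add(Duration::from_millis(millis));
    }
    // Unix seconds, e.g. "1678600000" or "1678600000.5".
    let secs: f64 = timestamp.parse().ok()?;
    if !secs.is_finite() || secs < 0.0 {
        return None;
    }
    UNIX_EPOCH.checked_add(Duration::from_secs_f64(secs))
}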

View File

@@ -12,6 +12,5 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod error;
pub mod label_values;
pub(crate) mod error;
pub mod planner;

View File

@@ -1,107 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::{SystemTime, UNIX_EPOCH};
use datafusion_common::{Column, ScalarValue};
use datafusion_expr::expr::Alias;
use datafusion_expr::utils::conjunction;
use datafusion_expr::{col, Cast, Expr, LogicalPlan, LogicalPlanBuilder};
use datafusion_sql::TableReference;
use datatypes::arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit};
use datatypes::prelude::ConcreteDataType;
use snafu::{OptionExt, ResultExt};
use table::TableRef;
use crate::promql::error::{DataFusionPlanningSnafu, Result, TimeIndexNotFoundSnafu};
fn build_time_filter(time_index_expr: Expr, start: i64, end: i64) -> Expr {
time_index_expr
.clone()
.gt_eq(Expr::Literal(ScalarValue::TimestampMillisecond(
Some(start),
None,
)))
.and(
time_index_expr.lt_eq(Expr::Literal(ScalarValue::TimestampMillisecond(
Some(end),
None,
))),
)
}
/// Rewrite label values query to DataFusion logical plan.
pub fn rewrite_label_values_query(
table: TableRef,
mut scan_plan: LogicalPlan,
mut conditions: Vec<Expr>,
label_name: String,
start: SystemTime,
end: SystemTime,
) -> Result<LogicalPlan> {
let table_ref = TableReference::partial(
table.table_info().schema_name.as_str(),
table.table_info().name.as_str(),
);
let schema = table.schema();
let ts_column = schema
.timestamp_column()
.with_context(|| TimeIndexNotFoundSnafu {
table: table.table_info().full_table_name(),
})?;
let is_time_index_ms =
ts_column.data_type == ConcreteDataType::timestamp_millisecond_datatype();
let time_index_expr = col(Column::from_name(ts_column.name.clone()));
if !is_time_index_ms {
// cast to ms if time_index not in Millisecond precision
let expr = vec![
col(Column::from_name(label_name.clone())),
Expr::Alias(Alias {
expr: Box::new(Expr::Cast(Cast {
expr: Box::new(time_index_expr.clone()),
data_type: ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None),
})),
relation: Some(table_ref),
name: ts_column.name.clone(),
}),
];
scan_plan = LogicalPlanBuilder::from(scan_plan)
.project(expr)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
};
let start = start.duration_since(UNIX_EPOCH).unwrap().as_millis() as i64;
let end = end.duration_since(UNIX_EPOCH).unwrap().as_millis() as i64;
conditions.push(build_time_filter(time_index_expr, start, end));
// Safety: `conditions` is not empty.
let filter = conjunction(conditions).unwrap();
// Builds time filter
let logical_plan = LogicalPlanBuilder::from(scan_plan)
.filter(filter)
.context(DataFusionPlanningSnafu)?
.project(vec![col(Column::from_name(label_name))])
.context(DataFusionPlanningSnafu)?
.distinct()
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
Ok(logical_plan)
}

View File

@@ -51,8 +51,8 @@ use promql::extension_plan::{
RangeManipulate, ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn,
};
use promql::functions::{
quantile_udaf, AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters,
IDelta, Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters, IDelta,
Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
QuantileOverTime, Rate, Resets, Round, StddevOverTime, StdvarOverTime, SumOverTime,
};
use promql_parser::label::{MatchOp, Matcher, Matchers, METRIC_NAME};
@@ -266,10 +266,7 @@ impl PromPlanner {
aggr_expr: &AggregateExpr,
) -> Result<LogicalPlan> {
let AggregateExpr {
op,
expr,
modifier,
param,
op, expr, modifier, ..
} = aggr_expr;
let input = self.prom_expr_to_plan(expr, session_state).await?;
@@ -280,40 +277,19 @@ impl PromPlanner {
_ => {
// calculate columns to group by
// Need to append time index column into group by columns
let mut group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
// convert op and value columns to aggregate exprs
let (aggr_exprs, prev_field_exprs) =
self.create_aggregate_exprs(*op, param, &input)?;
let aggr_exprs = self.create_aggregate_exprs(*op, &input)?;
// create plan
let builder = LogicalPlanBuilder::from(input);
let builder = if op.id() == token::T_COUNT_VALUES {
let label = Self::get_param_value_as_str(*op, param)?;
// `count_values` must be grouped by fields,
// and project the fields to the new label.
group_exprs.extend(prev_field_exprs.clone());
let project_fields = self
.create_field_column_exprs()?
.into_iter()
.chain(self.create_tag_column_exprs()?)
.chain(Some(self.create_time_index_column_expr()?))
.chain(prev_field_exprs.into_iter().map(|expr| expr.alias(label)));
builder
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
.project(project_fields)
.context(DataFusionPlanningSnafu)?
} else {
builder
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
};
let sort_expr = group_exprs.into_iter().map(|expr| expr.sort(true, false));
builder
.sort(sort_expr)
let group_sort_expr = group_exprs
.clone()
.into_iter()
.map(|expr| expr.sort(true, false));
LogicalPlanBuilder::from(input)
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
.sort(group_sort_expr)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)
@@ -336,7 +312,18 @@ impl PromPlanner {
let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, false)?;
let val = Self::get_param_value_as_f64(*op, param)?;
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: (*op).to_string(),
})?;
let PromExpr::NumberLiteral(NumberLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: (*op).to_string(),
}
.fail();
};
// convert op and value columns to window exprs.
let window_exprs = self.create_window_exprs(*op, group_exprs.clone(), &input)?;
@@ -354,7 +341,7 @@ impl PromPlanner {
let predicate = DfExpr::BinaryExpr(BinaryExpr {
left: Box::new(col(rank)),
op: Operator::LtEq,
right: Box::new(lit(val)),
right: Box::new(lit(*val)),
});
match expr {
@@ -939,7 +926,7 @@ impl PromPlanner {
Some(Offset::Neg(duration)) => -(duration.as_millis() as Millisecond),
None => 0,
};
let mut scan_filters = Self::matchers_to_expr(label_matchers.clone(), table_schema)?;
let mut scan_filters = self.matchers_to_expr(label_matchers.clone(), table_schema)?;
if let Some(time_index_filter) = self.build_time_index_filter(offset_duration)? {
scan_filters.push(time_index_filter);
}
@@ -1135,7 +1122,8 @@ impl PromPlanner {
}
// TODO(ruihang): ignore `MetricNameLabel` (`__name__`) matcher
pub fn matchers_to_expr(
fn matchers_to_expr(
&self,
label_matchers: Matchers,
table_schema: &DFSchemaRef,
) -> Result<Vec<DfExpr>> {
@@ -1943,44 +1931,32 @@ impl PromPlanner {
})
}
/// Creates a set of DataFusion `DfExpr::AggregateFunction` expressions for each value column using the specified aggregate function.
/// Create [DfExpr::AggregateFunction] expr for each value column with given aggregate function.
///
/// # Side Effects
///
/// This method modifies the value columns in the context by replacing them with the new columns
/// created by the aggregate function application.
///
/// # Returns
///
/// Returns a tuple of `(aggregate_expressions, previous_field_expressions)` where:
/// - `aggregate_expressions`: Expressions that apply the aggregate function to the original fields
/// - `previous_field_expressions`: Original field expressions before aggregation. This is non-empty
/// only when the operation is `count_values`, as this operation requires preserving the original
/// values for grouping.
/// # Side effect
///
/// This method will update value columns in context to the new value columns created by
/// aggregate function.
fn create_aggregate_exprs(
&mut self,
op: TokenType,
param: &Option<Box<PromExpr>>,
input_plan: &LogicalPlan,
) -> Result<(Vec<DfExpr>, Vec<DfExpr>)> {
) -> Result<Vec<DfExpr>> {
let aggr = match op.id() {
token::T_SUM => sum_udaf(),
token::T_QUANTILE => {
let q = Self::get_param_value_as_f64(op, param)?;
quantile_udaf(q)
}
token::T_AVG => avg_udaf(),
token::T_COUNT_VALUES | token::T_COUNT => count_udaf(),
token::T_COUNT => count_udaf(),
token::T_MIN => min_udaf(),
token::T_MAX => max_udaf(),
token::T_GROUP => grouping_udaf(),
token::T_STDDEV => stddev_pop_udaf(),
token::T_STDVAR => var_pop_udaf(),
token::T_TOPK | token::T_BOTTOMK => UnsupportedExprSnafu {
name: format!("{op:?}"),
token::T_TOPK | token::T_BOTTOMK | token::T_COUNT_VALUES | token::T_QUANTILE => {
UnsupportedExprSnafu {
name: format!("{op:?}"),
}
.fail()?
}
.fail()?,
_ => UnexpectedTokenSnafu { token: op }.fail()?,
};
@@ -1990,41 +1966,19 @@ impl PromPlanner {
.field_columns
.iter()
.map(|col| {
Ok(DfExpr::AggregateFunction(AggregateFunction {
DfExpr::AggregateFunction(AggregateFunction {
func: aggr.clone(),
args: vec![DfExpr::Column(Column::from_name(col))],
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
}))
})
})
.collect::<Result<Vec<_>>>()?;
.collect();
// if the aggregator is `count_values`, it must be grouped by current fields.
let prev_field_exprs = if op.id() == token::T_COUNT_VALUES {
let prev_field_exprs: Vec<_> = self
.ctx
.field_columns
.iter()
.map(|col| DfExpr::Column(Column::from_name(col)))
.collect();
ensure!(
self.ctx.field_columns.len() == 1,
UnsupportedExprSnafu {
name: "count_values on multi-value input"
}
);
prev_field_exprs
} else {
vec![]
};
// update value column name according to the aggregators,
// update value column name according to the aggregators
let mut new_field_columns = Vec::with_capacity(self.ctx.field_columns.len());
let normalized_exprs =
normalize_cols(exprs.iter().cloned(), input_plan).context(DataFusionPlanningSnafu)?;
for expr in normalized_exprs {
@@ -2032,39 +1986,7 @@ impl PromPlanner {
}
self.ctx.field_columns = new_field_columns;
Ok((exprs, prev_field_exprs))
}
fn get_param_value_as_str(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<&str> {
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
})?;
let PromExpr::StringLiteral(StringLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
}
.fail();
};
Ok(val)
}
fn get_param_value_as_f64(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<f64> {
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
})?;
let PromExpr::NumberLiteral(NumberLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
}
.fail();
};
Ok(*val)
Ok(exprs)
}
/// Create [DfExpr::WindowFunction] expr for each value column with given window function.
@@ -3420,6 +3342,30 @@ mod test {
do_aggregate_expr_plan("stdvar", "var_pop").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_top_k() {
do_aggregate_expr_plan("topk", "").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_bottom_k() {
do_aggregate_expr_plan("bottomk", "").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_count_values() {
do_aggregate_expr_plan("count_values", "").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_quantile() {
do_aggregate_expr_plan("quantile", "").await;
}
// TODO(ruihang): add range fn tests once exprs are ready.
// {
@@ -4302,98 +4248,4 @@ mod test {
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
#[tokio::test]
async fn test_count_values_expr() {
let mut eval_stmt = EvalStmt {
expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
start: UNIX_EPOCH,
end: UNIX_EPOCH
.checked_add(Duration::from_secs(100_000))
.unwrap(),
interval: Duration::from_secs(5),
lookback_delta: Duration::from_secs(1),
};
let case = r#"count_values('series', prometheus_tsdb_head_series{ip=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237:9090)"}) by (ip)"#;
let prom_expr = parser::parse(case).unwrap();
eval_stmt.expr = prom_expr;
let table_provider = build_test_table_provider_with_fields(
&[
(
DEFAULT_SCHEMA_NAME.to_string(),
"prometheus_tsdb_head_series".to_string(),
),
(
DEFAULT_SCHEMA_NAME.to_string(),
"http_server_requests_seconds_count".to_string(),
),
],
&["ip"],
)
.await;
let plan = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_session_state())
.await
.unwrap();
let expected = r#"Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, series [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N]
Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, prometheus_tsdb_head_series.greptime_value ASC NULLS LAST [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N, greptime_value:Float64;N]
Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value AS series, prometheus_tsdb_head_series.greptime_value [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N, greptime_value:Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value]], aggr=[[count(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N, count(prometheus_tsdb_head_series.greptime_value):Int64]
PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [false] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Sort: prometheus_tsdb_head_series.ip DESC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp DESC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Filter: prometheus_tsdb_head_series.ip ~ Utf8("(10\.0\.160\.237:8080|10\.0\.160\.237:9090)") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-1000, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100001000, None) [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]"#;
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
#[tokio::test]
async fn test_quantile_expr() {
let mut eval_stmt = EvalStmt {
expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
start: UNIX_EPOCH,
end: UNIX_EPOCH
.checked_add(Duration::from_secs(100_000))
.unwrap(),
interval: Duration::from_secs(5),
lookback_delta: Duration::from_secs(1),
};
let case = r#"quantile(0.3, sum(prometheus_tsdb_head_series{ip=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237:9090)"}) by (ip))"#;
let prom_expr = parser::parse(case).unwrap();
eval_stmt.expr = prom_expr;
let table_provider = build_test_table_provider_with_fields(
&[
(
DEFAULT_SCHEMA_NAME.to_string(),
"prometheus_tsdb_head_series".to_string(),
),
(
DEFAULT_SCHEMA_NAME.to_string(),
"http_server_requests_seconds_count".to_string(),
),
],
&["ip"],
)
.await;
let plan = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_session_state())
.await
.unwrap();
let expected = r#"Sort: prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [greptime_timestamp:Timestamp(Millisecond, None), quantile(sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[quantile(sum(prometheus_tsdb_head_series.greptime_value))]] [greptime_timestamp:Timestamp(Millisecond, None), quantile(sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]
Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[sum(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]
PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [false] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Sort: prometheus_tsdb_head_series.ip DESC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp DESC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Filter: prometheus_tsdb_head_series.ip ~ Utf8("(10\.0\.160\.237:8080|10\.0\.160\.237:9090)") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-1000, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100001000, None) [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]"#;
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
}

View File

@@ -410,15 +410,6 @@ pub enum Error {
source: query::error::Error,
},
#[snafu(display("Failed to parse timestamp: {}", timestamp))]
ParseTimestamp {
timestamp: String,
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: query::error::Error,
},
#[snafu(display("{}", reason))]
UnexpectedResult {
reason: String,
@@ -694,8 +685,7 @@ impl ErrorExt for Error {
| PrepareStatementNotFound { .. }
| FailedToParseQuery { .. }
| InvalidElasticsearchInput { .. }
| InvalidJaegerQuery { .. }
| ParseTimestamp { .. } => StatusCode::InvalidArguments,
| InvalidJaegerQuery { .. } => StatusCode::InvalidArguments,
Catalog { source, .. } => source.status_code(),
RowWriter { source, .. } => source.status_code(),

View File

@@ -29,7 +29,7 @@ use common_time::util::{current_time_rfc3339, yesterday_rfc3339};
use common_version::OwnedBuildInfo;
use datatypes::prelude::ConcreteDataType;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::Float64Vector;
use datatypes::vectors::{Float64Vector, StringVector};
use futures::future::join_all;
use futures::StreamExt;
use promql_parser::label::{MatchOp, Matcher, Matchers, METRIC_NAME};
@@ -38,7 +38,7 @@ use promql_parser::parser::{
AggregateExpr, BinaryExpr, Call, Expr as PromqlExpr, MatrixSelector, ParenExpr, SubqueryExpr,
UnaryExpr, VectorSelector,
};
use query::parser::{PromQuery, QueryLanguageParser, DEFAULT_LOOKBACK_STRING};
use query::parser::{PromQuery, DEFAULT_LOOKBACK_STRING};
use query::promql::planner::normalize_matcher;
use serde::de::{self, MapAccess, Visitor};
use serde::{Deserialize, Serialize};
@@ -51,8 +51,8 @@ use store_api::metric_engine_consts::{
pub use super::result::prometheus_resp::PrometheusJsonResponse;
use crate::error::{
CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, ParseTimestampSnafu, Result,
TableNotFoundSnafu, UnexpectedResultSnafu,
CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, Result, TableNotFoundSnafu,
UnexpectedResultSnafu,
};
use crate::http::header::collect_plan_metrics;
use crate::prom_store::{FIELD_NAME_LABEL, METRIC_NAME_LABEL};
@@ -994,58 +994,44 @@ pub async fn label_values_query(
let start = params.start.unwrap_or_else(yesterday_rfc3339);
let end = params.end.unwrap_or_else(current_time_rfc3339);
let lookback = params
.lookback
.unwrap_or_else(|| DEFAULT_LOOKBACK_STRING.to_string());
let mut label_values = HashSet::new();
let start = try_call_return_response!(QueryLanguageParser::parse_promql_timestamp(&start)
.context(ParseTimestampSnafu { timestamp: &start }));
let end = try_call_return_response!(QueryLanguageParser::parse_promql_timestamp(&end)
.context(ParseTimestampSnafu { timestamp: &end }));
let mut merge_map = HashMap::new();
for query in queries {
let promql_expr = try_call_return_response!(promql_parser::parser::parse(&query));
let PromqlExpr::VectorSelector(VectorSelector { name, matchers, .. }) = promql_expr else {
return PrometheusJsonResponse::error(
StatusCode::InvalidArguments,
"expected vector selector",
);
let prom_query = PromQuery {
query,
start: start.clone(),
end: end.clone(),
step: DEFAULT_LOOKBACK_STRING.to_string(),
lookback: lookback.clone(),
};
let Some(name) = name else {
return PrometheusJsonResponse::error(
StatusCode::InvalidArguments,
"expected metric name",
);
};
// Only the matchers are used for filtering.
let matchers = matchers.matchers;
let result = handler
.query_label_values(
name,
label_name.to_string(),
matchers,
start,
end,
&query_ctx,
)
.await;
match result {
Ok(result) => {
label_values.extend(result.into_iter());
}
Err(err) => {
// Prometheus won't report an error when querying a nonexistent label or metric
if err.status_code() != StatusCode::TableNotFound
&& err.status_code() != StatusCode::TableColumnNotFound
{
return PrometheusJsonResponse::error(err.status_code(), err.output_msg());
}
let result = handler.do_query(&prom_query, query_ctx.clone()).await;
if let Err(err) =
retrieve_label_values(result, &label_name, &mut label_values, &mut merge_map).await
{
// Prometheus won't report an error when querying a nonexistent label or metric
if err.status_code() != StatusCode::TableNotFound
&& err.status_code() != StatusCode::TableColumnNotFound
{
return PrometheusJsonResponse::error(err.status_code(), err.output_msg());
}
}
}
let merge_map = merge_map
.into_iter()
.map(|(k, v)| (k, Value::from(v)))
.collect();
let mut label_values: Vec<_> = label_values.into_iter().collect();
label_values.sort_unstable();
PrometheusJsonResponse::success(PrometheusResponse::LabelValues(label_values))
let mut resp = PrometheusJsonResponse::success(PrometheusResponse::LabelValues(label_values));
resp.resp_metrics = merge_map;
resp
}
async fn retrieve_field_names(
@@ -1090,6 +1076,71 @@ async fn retrieve_field_names(
Ok(field_columns)
}
async fn retrieve_label_values(
result: Result<Output>,
label_name: &str,
labels_values: &mut HashSet<String>,
metrics: &mut HashMap<String, u64>,
) -> Result<()> {
let result = result?;
match result.data {
OutputData::RecordBatches(batches) => {
retrieve_label_values_from_record_batch(batches, label_name, labels_values).await
}
OutputData::Stream(stream) => {
let batches = RecordBatches::try_collect(stream)
.await
.context(CollectRecordbatchSnafu)?;
retrieve_label_values_from_record_batch(batches, label_name, labels_values).await
}
OutputData::AffectedRows(_) => UnexpectedResultSnafu {
reason: "expected data result, but got affected rows".to_string(),
}
.fail(),
}?;
if let Some(ref plan) = result.meta.plan {
collect_plan_metrics(plan, &mut [metrics]);
}
Ok(())
}
async fn retrieve_label_values_from_record_batch(
batches: RecordBatches,
label_name: &str,
labels_values: &mut HashSet<String>,
) -> Result<()> {
let Some(label_col_idx) = batches.schema().column_index_by_name(label_name) else {
return Ok(());
};
// check whether label_name belongs to tag column
match batches
.schema()
.column_schema_by_name(label_name)
.unwrap()
.data_type
{
ConcreteDataType::String(_) => {}
_ => return Ok(()),
}
for batch in batches.iter() {
let label_column = batch
.column(label_col_idx)
.as_any()
.downcast_ref::<StringVector>()
.unwrap();
for row_index in 0..batch.num_rows() {
if let Some(label_value) = label_column.get_data(row_index) {
let _ = labels_values.insert(label_value.to_string());
}
}
}
Ok(())
}
/// Tries to parse and extract the name of the referenced metric from the PromQL query.
///
/// Returns the metric name if a single metric is referenced, otherwise `None`.
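The body of this helper is not shown in the hunk. A minimal sketch of the simplest case, using the same `promql_parser` pattern that appears in the `label_values_query` diff above, could look like the following; the function name is hypothetical, and the real implementation presumably also walks nested expressions, which this sketch does not attempt.
use promql_parser::parser::{self, Expr as PromqlExpr, VectorSelector};
// Hypothetical sketch: return the metric name only when the query is a plain
// vector selector such as `http_requests_total{job="api"}`.
fn referenced_metric_name(query: &str) -> Option<String> {
    match parser::parse(query).ok()? {
        PromqlExpr::VectorSelector(VectorSelector { name, .. }) => name,
        _ => None,
    }
}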

Some files were not shown because too many files have changed in this diff.