Merge branch 'main' into chore/rust-0321

2026-05-21 15:30:40 +00:00 · 2026-03-25 09:58:41 +08:00
parent 9a4e8c7a60 13cdfa9b59
commit a6e3ba706f
65 changed files with 3223 additions and 699 deletions
--- a/.github/scripts/upload-artifacts-to-s3.sh
+++ b/.github/scripts/upload-artifacts-to-s3.sh
@@ -33,7 +33,7 @@ function upload_artifacts() {
  #    └── greptime-darwin-amd64-v0.2.0.tar.gz
  find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
    filename=$(basename "$file")
-    TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION/$filename"
+    TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION"

    curl -X PUT \
      -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
@@ -49,7 +49,7 @@ function update_version_info() {
    if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
      echo "Updating latest-version.txt"
      echo "$VERSION" > latest-version.txt
-      TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-version.txt"
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS"

      curl -X PUT \
        -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
@@ -62,7 +62,7 @@ function update_version_info() {
      echo "Updating latest-nightly-version.txt"
      echo "$VERSION" > latest-nightly-version.txt

-      TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-nightly-version.txt"
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
      curl -X PUT \
        -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
        -F "file=@latest-nightly-version.txt" \
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
 dependencies = [
 "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]

 [[package]]
@@ -11635,9 +11635,9 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.103.3"
+version = "0.103.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
+checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
 dependencies = [
 "ring",
 "rustls-pki-types",
@@ -13404,9 +13404,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"

 [[package]]
 name = "tar"
-version = "0.4.44"
+version = "0.4.45"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a"
+checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
 dependencies = [
 "filetime",
 "libc",
--- a/src/catalog/src/kvbackend/table_cache.rs
+++ b/src/catalog/src/kvbackend/table_cache.rs
@@ -65,11 +65,13 @@ fn init_factory(

 fn invalidator<'a>(
    cache: &'a Cache<TableName, TableRef>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, MetaResult<()>> {
    Box::pin(async move {
-        if let CacheIdent::TableName(table_name) = ident {
-            cache.invalidate(table_name).await
+        for ident in idents {
+            if let CacheIdent::TableName(table_name) = ident {
+                cache.invalidate(table_name).await
+            }
        }
        Ok(())
    })
--- a/src/common/meta/Cargo.toml
+++ b/src/common/meta/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 testing = []
 pg_kvbackend = [
    "dep:tokio-postgres",
-    "dep:backon",
    "dep:deadpool-postgres",
    "dep:deadpool",
    "dep:tokio-postgres-rustls",
@@ -16,7 +15,7 @@ pg_kvbackend = [
    "dep:rustls-native-certs",
    "dep:rustls",
 ]
-mysql_kvbackend = ["dep:sqlx", "dep:backon"]
+mysql_kvbackend = ["dep:sqlx"]
 enterprise = ["prost-types"]

 [lints]
@@ -28,7 +27,7 @@ api.workspace = true
 async-recursion = "1.0"
 async-stream.workspace = true
 async-trait.workspace = true
-backon = { workspace = true, optional = true }
+backon.workspace = true
 base64.workspace = true
 bytes.workspace = true
 chrono.workspace = true
--- a/src/common/meta/src/cache/container.rs
+++ b/src/common/meta/src/cache/container.rs
@@ -15,10 +15,14 @@
 use std::borrow::Borrow;
 use std::hash::Hash;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Duration;

-use futures::future::{BoxFuture, join_all};
+use backon::{BackoffBuilder, ExponentialBuilder};
+use futures::future::BoxFuture;
 use moka::future::Cache;
 use snafu::{OptionExt, ResultExt};
+use tokio::time::sleep;

 use crate::cache_invalidator::{CacheInvalidator, Context};
 use crate::error::{self, Error, Result};
@@ -29,12 +33,29 @@ use crate::metrics;
 pub type TokenFilter<CacheToken> = Box<dyn Fn(&CacheToken) -> bool + Send + Sync>;

 /// Invalidates cached values by [CacheToken]s.
-pub type Invalidator<K, V, CacheToken> =
-    Box<dyn for<'a> Fn(&'a Cache<K, V>, &'a CacheToken) -> BoxFuture<'a, Result<()>> + Send + Sync>;
+pub type Invalidator<K, V, CacheToken> = Box<
+    dyn for<'a> Fn(&'a Cache<K, V>, &'a [&CacheToken]) -> BoxFuture<'a, Result<()>> + Send + Sync,
+>;

 /// Initializes value (i.e., fetches from remote).
 pub type Initializer<K, V> = Arc<dyn Fn(&'_ K) -> BoxFuture<'_, Result<Option<V>>> + Send + Sync>;

+#[derive(Debug, Clone, Copy)]
+/// Initialization strategy for cache-miss loading.
+///
+/// This strategy is selected when building [CacheContainer] and remains immutable
+/// for the lifetime of the container instance.
+pub enum InitStrategy {
+    /// Fast path: load once without version conflict retry.
+    ///
+    /// Under concurrent invalidation, callers may observe a stale/dirty value.
+    Unchecked,
+    /// Strict path: retry load when version changes during initialization.
+    ///
+    /// This avoids returning a dirty value under invalidate/load races.
+    VersionChecked,
+}
+
 /// [CacheContainer] provides ability to:
 /// - Cache value loaded by [Initializer].
 /// - Invalidate caches by [Invalidator].
@@ -44,6 +65,16 @@ pub struct CacheContainer<K, V, CacheToken> {
    invalidator: Invalidator<K, V, CacheToken>,
    initializer: Initializer<K, V>,
    token_filter: fn(&CacheToken) -> bool,
+    version: Arc<AtomicUsize>,
+    init_strategy: InitStrategy,
+}
+
+fn latest_get_backoff() -> impl Iterator<Item = Duration> {
+    ExponentialBuilder::default()
+        .with_min_delay(Duration::from_millis(10))
+        .with_max_delay(Duration::from_millis(100))
+        .with_max_times(3)
+        .build()
 }

 impl<K, V, CacheToken> CacheContainer<K, V, CacheToken>
@@ -52,13 +83,37 @@ where
    V: Send + Sync,
    CacheToken: Send + Sync,
 {
-    /// Constructs an [CacheContainer].
+    /// Constructs a [CacheContainer] with [InitStrategy::Unchecked].
+    ///
+    /// This keeps the historical behavior and can return a stale/dirty value under
+    /// concurrent invalidation.
    pub fn new(
        name: String,
        cache: Cache<K, V>,
        invalidator: Invalidator<K, V, CacheToken>,
        initializer: Initializer<K, V>,
        token_filter: fn(&CacheToken) -> bool,
+    ) -> Self {
+        Self::with_strategy(
+            name,
+            cache,
+            invalidator,
+            initializer,
+            token_filter,
+            InitStrategy::Unchecked,
+        )
+    }
+
+    /// Constructs a [CacheContainer] with an explicit [InitStrategy].
+    ///
+    /// The strategy is fixed at construction time and cannot be changed later.
+    pub fn with_strategy(
+        name: String,
+        cache: Cache<K, V>,
+        invalidator: Invalidator<K, V, CacheToken>,
+        initializer: Initializer<K, V>,
+        token_filter: fn(&CacheToken) -> bool,
+        init_strategy: InitStrategy,
    ) -> Self {
        Self {
            name,
@@ -66,6 +121,8 @@ where
            invalidator,
            initializer,
            token_filter,
+            version: Arc::new(AtomicUsize::new(0)),
+            init_strategy,
        }
    }

@@ -75,6 +132,67 @@ where
    }
 }

+impl<K, V, CacheToken> CacheContainer<K, V, CacheToken> {
+    fn inc_version(&self) {
+        self.version.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+async fn init<'a, K, V>(init: Initializer<K, V>, key: K, cache_name: &'a str) -> Result<V>
+where
+    K: Send + Sync + 'a,
+    V: Send + 'a,
+{
+    metrics::CACHE_CONTAINER_CACHE_MISS
+        .with_label_values(&[cache_name])
+        .inc();
+    let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
+        .with_label_values(&[cache_name])
+        .start_timer();
+    init(&key)
+        .await
+        .transpose()
+        .context(error::ValueNotExistSnafu)?
+}
+
+async fn init_with_retry<'a, K, V>(
+    init: Initializer<K, V>,
+    key: K,
+    mut backoff: impl Iterator<Item = Duration> + 'a,
+    version: Arc<AtomicUsize>,
+    cache_name: &'a str,
+) -> Result<V>
+where
+    K: Send + Sync + 'a,
+    V: Send + 'a,
+{
+    let mut attempts = 1usize;
+    loop {
+        let pre_version = version.load(Ordering::Relaxed);
+        metrics::CACHE_CONTAINER_CACHE_MISS
+            .with_label_values(&[cache_name])
+            .inc();
+        let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
+            .with_label_values(&[cache_name])
+            .start_timer();
+        let value = init(&key)
+            .await
+            .transpose()
+            .context(error::ValueNotExistSnafu)??;
+
+        if pre_version == version.load(Ordering::Relaxed) {
+            return Ok(value);
+        }
+
+        if let Some(duration) = backoff.next() {
+            sleep(duration).await;
+            attempts += 1;
+        } else {
+            return error::GetLatestCacheRetryExceededSnafu { attempts }.fail();
+        }
+    }
+}
+
 #[async_trait::async_trait]
 impl<K, V> CacheInvalidator for CacheContainer<K, V, CacheIdent>
 where
@@ -82,14 +200,15 @@ where
    V: Send + Sync,
 {
    async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> {
-        let tasks = caches
+        let idents = caches
            .iter()
            .filter(|token| (self.token_filter)(token))
-            .map(|token| (self.invalidator)(&self.cache, token));
-        join_all(tasks)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Vec<_>>();
+        if !idents.is_empty() {
+            self.inc_version();
+            (self.invalidator)(&self.cache, &idents).await?;
+        }
+
        Ok(())
    }
 }
@@ -99,27 +218,39 @@ where
    K: Copy + Hash + Eq + Send + Sync + 'static,
    V: Clone + Send + Sync + 'static,
 {
-    /// Returns a _clone_ of the value corresponding to the key.
+    /// Returns a value from cache for copyable keys.
+    ///
+    /// With [InitStrategy::Unchecked], this method prioritizes latency and may
+    /// return a stale/dirty value. With [InitStrategy::VersionChecked], this method
+    /// retries initialization on version change and avoids dirty returns.
    pub async fn get(&self, key: K) -> Result<Option<V>> {
        metrics::CACHE_CONTAINER_CACHE_GET
            .with_label_values(&[&self.name])
            .inc();
-        let moved_init = self.initializer.clone();
-        let moved_key = key;
-        let init = async move {
-            metrics::CACHE_CONTAINER_CACHE_MISS
-                .with_label_values(&[&self.name])
-                .inc();
-            let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
-                .with_label_values(&[&self.name])
-                .start_timer();
-            moved_init(&moved_key)
-                .await
-                .transpose()
-                .context(error::ValueNotExistSnafu)?
+
+        let result = match self.init_strategy {
+            InitStrategy::Unchecked => {
+                self.cache
+                    .try_get_with(key, init(self.initializer.clone(), key, &self.name))
+                    .await
+            }
+            InitStrategy::VersionChecked => {
+                self.cache
+                    .try_get_with(
+                        key,
+                        init_with_retry(
+                            self.initializer.clone(),
+                            key,
+                            latest_get_backoff(),
+                            self.version.clone(),
+                            &self.name,
+                        ),
+                    )
+                    .await
+            }
        };

-        match self.cache.try_get_with(key, init).await {
+        match result {
            Ok(value) => Ok(Some(value)),
            Err(err) => match err.as_ref() {
                Error::ValueNotExist { .. } => Ok(None),
@@ -136,14 +267,15 @@ where
 {
    /// Invalidates cache by [CacheToken].
    pub async fn invalidate(&self, caches: &[CacheToken]) -> Result<()> {
-        let tasks = caches
+        let idents = caches
            .iter()
            .filter(|token| (self.token_filter)(token))
-            .map(|token| (self.invalidator)(&self.cache, token));
-        join_all(tasks)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Vec<_>>();
+        if !idents.is_empty() {
+            self.inc_version();
+            (self.invalidator)(&self.cache, &idents).await?;
+        }
+
        Ok(())
    }

@@ -156,7 +288,11 @@ where
        self.cache.contains_key(key)
    }

-    /// Returns a _clone_ of the value corresponding to the key.
+    /// Returns a value from cache by key reference.
+    ///
+    /// With [InitStrategy::Unchecked], this method prioritizes latency and may
+    /// return a stale/dirty value. With [InitStrategy::VersionChecked], this method
+    /// retries initialization on version change and avoids dirty returns.
    pub async fn get_by_ref<Q>(&self, key: &Q) -> Result<Option<V>>
    where
        K: Borrow<Q>,
@@ -165,24 +301,32 @@ where
        metrics::CACHE_CONTAINER_CACHE_GET
            .with_label_values(&[&self.name])
            .inc();
-        let moved_init = self.initializer.clone();
-        let moved_key = key.to_owned();
-
-        let init = async move {
-            metrics::CACHE_CONTAINER_CACHE_MISS
-                .with_label_values(&[&self.name])
-                .inc();
-            let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
-                .with_label_values(&[&self.name])
-                .start_timer();
-
-            moved_init(&moved_key)
-                .await
-                .transpose()
-                .context(error::ValueNotExistSnafu)?
+        let result = match self.init_strategy {
+            InitStrategy::Unchecked => {
+                self.cache
+                    .try_get_with_by_ref(
+                        key,
+                        init(self.initializer.clone(), key.to_owned(), &self.name),
+                    )
+                    .await
+            }
+            InitStrategy::VersionChecked => {
+                self.cache
+                    .try_get_with_by_ref(
+                        key,
+                        init_with_retry(
+                            self.initializer.clone(),
+                            key.to_owned(),
+                            latest_get_backoff(),
+                            self.version.clone(),
+                            &self.name,
+                        ),
+                    )
+                    .await
+            }
        };

-        match self.cache.try_get_with_by_ref(key, init).await {
+        match result {
            Ok(value) => Ok(Some(value)),
            Err(err) => match err.as_ref() {
                Error::ValueNotExist { .. } => Ok(None),
@@ -296,9 +440,11 @@ mod tests {
            moved_counter.fetch_add(1, Ordering::Relaxed);
            Box::pin(async { Ok(Some("hi".to_string())) })
        });
-        let invalidator: Invalidator<String, String, String> = Box::new(|cache, key| {
+        let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
            Box::pin(async move {
-                cache.invalidate(key).await;
+                for key in keys {
+                    cache.invalidate(*key).await;
+                }
                Ok(())
            })
        });
@@ -323,4 +469,46 @@ mod tests {
        assert_eq!(value, "hi");
        assert_eq!(counter.load(Ordering::Relaxed), 2);
    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_get_by_ref_returns_fresh_value_after_invalidate() {
+        let cache: Cache<String, String> = CacheBuilder::new(128).build();
+        let counter = Arc::new(AtomicI32::new(0));
+        let moved_counter = counter.clone();
+        let init: Initializer<String, String> = Arc::new(move |_| {
+            let counter = moved_counter.clone();
+            Box::pin(async move {
+                let n = counter.fetch_add(1, Ordering::Relaxed) + 1;
+                sleep(Duration::from_millis(100)).await;
+                Ok(Some(format!("v{n}")))
+            })
+        });
+        let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
+            Box::pin(async move {
+                for key in keys {
+                    cache.invalidate(*key).await;
+                }
+                Ok(())
+            })
+        });
+
+        let adv_cache = Arc::new(CacheContainer::with_strategy(
+            "test".to_string(),
+            cache,
+            invalidator,
+            init,
+            always_true_filter,
+            InitStrategy::VersionChecked,
+        ));
+
+        let moved_cache = adv_cache.clone();
+        let get_task = tokio::spawn(async move { moved_cache.get_by_ref("foo").await });
+
+        sleep(Duration::from_millis(50)).await;
+        adv_cache.invalidate(&["foo".to_string()]).await.unwrap();
+
+        let value = get_task.await.unwrap().unwrap().unwrap();
+        assert_eq!(value, "v2");
+        assert_eq!(counter.load(Ordering::Relaxed), 2);
+    }
 }
--- a/src/common/meta/src/cache/flow/table_flownode.rs
+++ b/src/common/meta/src/cache/flow/table_flownode.rs
@@ -170,20 +170,22 @@ async fn handle_drop_flow(

 fn invalidator<'a>(
    cache: &'a Cache<TableId, FlownodeFlowSet>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
    Box::pin(async move {
-        match ident {
-            CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
-            CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
-            CacheIdent::FlowNodeAddressChange(node_id) => {
-                info!(
-                    "Invalidate flow node cache for node_id in table_flownode: {}",
-                    node_id
-                );
-                cache.invalidate_all();
+        for ident in idents {
+            match ident {
+                CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
+                CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
+                CacheIdent::FlowNodeAddressChange(node_id) => {
+                    info!(
+                        "Invalidate flow node cache for node_id in table_flownode: {}",
+                        node_id
+                    );
+                    cache.invalidate_all();
+                }
+                _ => {}
            }
-            _ => {}
        }
        Ok(())
    })
--- a/src/common/meta/src/cache/table/schema.rs
+++ b/src/common/meta/src/cache/table/schema.rs
@@ -58,11 +58,13 @@ fn init_factory(schema_manager: SchemaManager) -> Initializer<SchemaName, Arc<Sc

 fn invalidator<'a>(
    cache: &'a Cache<SchemaName, Arc<SchemaNameValue>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, crate::error::Result<()>> {
    Box::pin(async move {
-        if let CacheIdent::SchemaName(schema_name) = ident {
-            cache.invalidate(schema_name).await
+        for ident in idents {
+            if let CacheIdent::SchemaName(schema_name) = ident {
+                cache.invalidate(schema_name).await
+            }
        }
        Ok(())
    })
--- a/src/common/meta/src/cache/table/table_info.rs
+++ b/src/common/meta/src/cache/table/table_info.rs
@@ -61,11 +61,13 @@ fn init_factory(table_info_manager: TableInfoManagerRef) -> Initializer<TableId,

 fn invalidator<'a>(
    cache: &'a Cache<TableId, Arc<TableInfo>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
    Box::pin(async move {
-        if let CacheIdent::TableId(table_id) = ident {
-            cache.invalidate(table_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(table_id) = ident {
+                cache.invalidate(table_id).await
+            }
        }
        Ok(())
    })
--- a/src/common/meta/src/cache/table/table_name.rs
+++ b/src/common/meta/src/cache/table/table_name.rs
@@ -71,11 +71,13 @@ fn init_factory(table_name_manager: TableNameManagerRef) -> Initializer<TableNam

 fn invalidator<'a>(
    cache: &'a Cache<TableName, TableId>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
    Box::pin(async move {
-        if let CacheIdent::TableName(table_name) = ident {
-            cache.invalidate(table_name).await
+        for ident in idents {
+            if let CacheIdent::TableName(table_name) = ident {
+                cache.invalidate(table_name).await
+            }
        }
        Ok(())
    })
--- a/src/common/meta/src/cache/table/table_route.rs
+++ b/src/common/meta/src/cache/table/table_route.rs
@@ -19,6 +19,7 @@ use moka::future::Cache;
 use snafu::OptionExt;
 use store_api::storage::TableId;

+use crate::cache::container::InitStrategy;
 use crate::cache::{CacheContainer, Initializer};
 use crate::error;
 use crate::error::Result;
@@ -65,7 +66,14 @@ pub fn new_table_route_cache(
    let table_info_manager = Arc::new(TableRouteManager::new(kv_backend));
    let init = init_factory(table_info_manager);

-    CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
+    CacheContainer::with_strategy(
+        name,
+        cache,
+        Box::new(invalidator),
+        init,
+        filter,
+        InitStrategy::VersionChecked,
+    )
 }

 fn init_factory(
@@ -92,11 +100,13 @@ fn init_factory(

 fn invalidator<'a>(
    cache: &'a Cache<TableId, Arc<TableRoute>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
    Box::pin(async move {
-        if let CacheIdent::TableId(table_id) = ident {
-            cache.invalidate(table_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(table_id) = ident {
+                cache.invalidate(table_id).await
+            }
        }
        Ok(())
    })
--- a/src/common/meta/src/cache/table/table_schema.rs
+++ b/src/common/meta/src/cache/table/table_schema.rs
@@ -65,7 +65,7 @@ fn init_factory(table_info_manager: TableInfoManager) -> Initializer<TableId, Ar
 /// Never invalidates table id schema cache.
 fn invalidator<'a>(
    _cache: &'a Cache<TableId, Arc<SchemaName>>,
-    _ident: &'a CacheIdent,
+    _idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, error::Result<()>> {
    Box::pin(std::future::ready(Ok(())))
 }
--- a/src/common/meta/src/cache/table/view_info.rs
+++ b/src/common/meta/src/cache/table/view_info.rs
@@ -60,11 +60,13 @@ fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer<TableId, A

 fn invalidator<'a>(
    cache: &'a Cache<TableId, Arc<ViewInfoValue>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
    Box::pin(async move {
-        if let CacheIdent::TableId(view_id) = ident {
-            cache.invalidate(view_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(view_id) = ident {
+                cache.invalidate(view_id).await
+            }
        }
        Ok(())
    })
--- a/src/common/meta/src/election.rs
+++ b/src/common/meta/src/election.rs
@@ -21,15 +21,85 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};

 use common_telemetry::{error, info, warn};
+use serde::{Deserialize, Serialize};
 use tokio::sync::broadcast::error::RecvError;
 use tokio::sync::broadcast::{self, Receiver, Sender};

 use crate::error::Result;
-use crate::metasrv::MetasrvNodeInfo;

-pub(crate) const CANDIDATE_LEASE_SECS: u64 = 600;
+pub const CANDIDATE_LEASE_SECS: u64 = 600;
 const KEEP_ALIVE_INTERVAL_SECS: u64 = CANDIDATE_LEASE_SECS / 2;

+/// The value of the leader. It is used to store the leader's address.
+pub struct LeaderValue(pub String);
+
+impl<T: AsRef<[u8]>> From<T> for LeaderValue {
+    fn from(value: T) -> Self {
+        let string = String::from_utf8_lossy(value.as_ref());
+        Self(string.to_string())
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MetasrvNodeInfo {
+    // The metasrv's address
+    pub addr: String,
+    // The node build version
+    pub version: String,
+    // The node build git commit hash
+    pub git_commit: String,
+    // The node start timestamp in milliseconds
+    pub start_time_ms: u64,
+    // The node total cpu millicores
+    #[serde(default)]
+    pub total_cpu_millicores: i64,
+    // The node total memory bytes
+    #[serde(default)]
+    pub total_memory_bytes: i64,
+    /// The node cpu usage millicores
+    #[serde(default)]
+    pub cpu_usage_millicores: i64,
+    /// The node memory usage bytes
+    #[serde(default)]
+    pub memory_usage_bytes: i64,
+    // The node hostname
+    #[serde(default)]
+    pub hostname: String,
+}
+
+// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
+#[allow(deprecated)]
+impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
+    fn from(node_info: MetasrvNodeInfo) -> Self {
+        Self {
+            peer: Some(api::v1::meta::Peer {
+                addr: node_info.addr,
+                ..Default::default()
+            }),
+            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
+            // New code should use the fields in `info.NodeInfo` instead.
+            version: node_info.version.clone(),
+            git_commit: node_info.git_commit.clone(),
+            start_time_ms: node_info.start_time_ms,
+            cpus: node_info.total_cpu_millicores as u32,
+            memory_bytes: node_info.total_memory_bytes as u64,
+            // The canonical location for node information.
+            info: Some(api::v1::meta::NodeInfo {
+                version: node_info.version,
+                git_commit: node_info.git_commit,
+                start_time_ms: node_info.start_time_ms,
+                total_cpu_millicores: node_info.total_cpu_millicores,
+                total_memory_bytes: node_info.total_memory_bytes,
+                cpu_usage_millicores: node_info.cpu_usage_millicores,
+                memory_usage_bytes: node_info.memory_usage_bytes,
+                cpus: node_info.total_cpu_millicores as u32,
+                memory_bytes: node_info.total_memory_bytes as u64,
+                hostname: node_info.hostname,
+            }),
+        }
+    }
+}
+
 /// Messages sent when the leader changes.
 #[derive(Debug, Clone)]
 pub enum LeaderChangeMessage {
@@ -168,3 +238,5 @@ pub trait Election: Send + Sync {

    fn subscribe_leader_change(&self) -> Receiver<LeaderChangeMessage>;
 }
+
+pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
--- a/src/common/meta/src/election/etcd.rs
+++ b/src/common/meta/src/election/etcd.rs
@@ -16,8 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;

-use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use etcd_client::{
    Client, GetOptions, LeaderKey as EtcdLeaderKey, LeaseKeepAliveStream, LeaseKeeper, PutOptions,
@@ -27,13 +25,15 @@ use tokio::sync::broadcast;
 use tokio::sync::broadcast::Receiver;
 use tokio::time::{MissedTickBehavior, timeout};

+use crate::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
 use crate::election::{
-    CANDIDATE_LEASE_SECS, Election, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, LeaderKey,
-    listen_leader_change, send_leader_change_and_set_flags,
+    CANDIDATE_LEASE_SECS, Election, ElectionRef, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage,
+    LeaderKey, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error;
 use crate::error::Result;
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};

 impl LeaderKey for EtcdLeaderKey {
    fn name(&self) -> &[u8] {
@@ -253,7 +253,7 @@ impl Election for EtcdElection {
                .leader(self.election_key())
                .await
                .context(error::EtcdFailedSnafu)?;
-            let leader_value = res.kv().context(error::NoLeaderSnafu)?.value();
+            let leader_value = res.kv().context(error::ElectionNoLeaderSnafu)?.value();
            Ok(leader_value.into())
        }
    }
@@ -279,7 +279,7 @@ impl EtcdElection {
            ensure!(
                res.ttl() > 0,
                error::UnexpectedSnafu {
-                    violated: "Failed to refresh the lease",
+                    err_msg: "Failed to refresh the lease".to_string(),
                }
            );

--- a/src/common/meta/src/election/rds.rs
+++ b/src/common/meta/src/election/rds.rs
@@ -36,7 +36,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
            .split(LEASE_SEP)
            .collect_tuple()
            .with_context(|| UnexpectedSnafu {
-                violated: format!(
+                err_msg: format!(
                    "Invalid value {}, expect node info || {} || expire time",
                    value, LEASE_SEP
                ),
@@ -45,7 +45,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
    let expire_time = match Timestamp::from_str(expire_time, None) {
        Ok(ts) => ts,
        Err(_) => UnexpectedSnafu {
-            violated: format!("Invalid timestamp: {}", expire_time),
+            err_msg: format!("Invalid timestamp: {}", expire_time),
        }
        .fail()?,
    };
--- a/src/common/meta/src/election/rds/mysql.rs
+++ b/src/common/meta/src/election/rds/mysql.rs
@@ -16,7 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;

-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use common_time::Timestamp;
 use snafu::{OptionExt, ResultExt, ensure};
@@ -29,14 +28,15 @@ use tokio::time::MissedTickBehavior;

 use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
 use crate::election::{
-    Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
+    Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error::{
    AcquireMySqlClientSnafu, DecodeSqlValueSnafu, DeserializeFromJsonSnafu,
-    LeaderLeaseChangedSnafu, LeaderLeaseExpiredSnafu, MySqlExecutionSnafu, NoLeaderSnafu, Result,
-    SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
+    ElectionLeaderLeaseChangedSnafu, ElectionLeaderLeaseExpiredSnafu, ElectionNoLeaderSnafu,
+    MySqlExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
 };
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};

 struct ElectionSqlFactory<'a> {
    table_name: &'a str,
@@ -592,7 +592,7 @@ impl Election for MySqlElection {
            ensure!(
                lease.expire_time > lease.current,
                UnexpectedSnafu {
-                    violated: format!(
+                    err_msg: format!(
                        "Candidate lease expired at {:?} (current time: {:?}), key: {:?}",
                        lease.expire_time,
                        lease.current,
@@ -667,10 +667,10 @@ impl Election for MySqlElection {
            let client = self.client.lock().await;
            let mut executor = Executor::Default(client);
            if let Some(lease) = self.get_value_with_lease(&key, &mut executor).await? {
-                ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+                ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
                Ok(lease.leader_value.as_bytes().into())
            } else {
-                NoLeaderSnafu.fail()
+                ElectionNoLeaderSnafu.fail()
            }
        }
    }
@@ -705,7 +705,7 @@ impl MySqlElection {
        let current_time = match Timestamp::from_str(&current_time_str, None) {
            Ok(ts) => ts,
            Err(_) => UnexpectedSnafu {
-                violated: format!("Invalid timestamp: {}", current_time_str),
+                err_msg: format!("Invalid timestamp: {}", current_time_str),
            }
            .fail()?,
        };
@@ -740,7 +740,7 @@ impl MySqlElection {
            current = match Timestamp::from_str(current_time_str, None) {
                Ok(ts) => ts,
                Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                }
                .fail()?,
            };
@@ -777,7 +777,7 @@ impl MySqlElection {
        ensure!(
            res == 1,
            UnexpectedSnafu {
-                violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
+                err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
            }
        );

@@ -920,9 +920,12 @@ impl MySqlElection {
    ///   will be released.
    /// - **Case 2**: If all checks pass, the function returns without performing any actions.
    fn lease_check(&self, lease: &Option<Lease>) -> Result<Lease> {
-        let lease = lease.as_ref().context(NoLeaderSnafu)?;
+        let lease = lease.as_ref().context(ElectionNoLeaderSnafu)?;
        // Case 1: Lease expired
-        ensure!(lease.expire_time > lease.current, LeaderLeaseExpiredSnafu);
+        ensure!(
+            lease.expire_time > lease.current,
+            ElectionLeaderLeaseExpiredSnafu
+        );
        // Case 2: Everything is fine
        Ok(lease.clone())
    }
@@ -960,7 +963,7 @@ impl MySqlElection {
        let remote_lease = self.get_value_with_lease(&key, &mut executor).await?;
        ensure!(
            expected_lease.map(|lease| lease.origin) == remote_lease.map(|lease| lease.origin),
-            LeaderLeaseChangedSnafu
+            ElectionLeaderLeaseChangedSnafu
        );
        self.delete_value(&key, &mut executor).await?;
        self.put_value_with_lease(
@@ -986,12 +989,11 @@ impl MySqlElection {
 mod tests {
    use std::{assert_matches, env};

-    use common_meta::maybe_skip_mysql_integration_test;
    use common_telemetry::init_default_ut_logging;
+    use sqlx::MySqlPool;

    use super::*;
-    use crate::error;
-    use crate::utils::mysql::create_mysql_pool;
+    use crate::{error, maybe_skip_mysql_integration_test};

    async fn create_mysql_client(
        table_name: Option<&str>,
@@ -1002,11 +1004,11 @@ mod tests {
        let endpoint = env::var("GT_MYSQL_ENDPOINTS").unwrap_or_default();
        if endpoint.is_empty() {
            return UnexpectedSnafu {
-                violated: "MySQL endpoint is empty".to_string(),
+                err_msg: "MySQL endpoint is empty".to_string(),
            }
            .fail();
        }
-        let pool = create_mysql_pool(&[endpoint], None).await.unwrap();
+        let pool = MySqlPool::connect(&endpoint).await.unwrap();
        let mut client = ElectionMysqlClient::new(
            pool,
            execution_timeout,
@@ -1301,7 +1303,7 @@ mod tests {
        let err = elected(&leader_mysql_election, table_name, Some(incorrect_lease))
            .await
            .unwrap_err();
-        assert_matches!(err, error::Error::LeaderLeaseChanged { .. });
+        assert_matches!(err, error::Error::ElectionLeaderLeaseChanged { .. });
        let lease = get_lease(&leader_mysql_election).await;
        assert!(lease.is_none());
        drop_table(&leader_mysql_election.client, table_name).await;
--- a/src/common/meta/src/election/rds/postgres.rs
+++ b/src/common/meta/src/election/rds/postgres.rs
@@ -16,7 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;

-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use common_time::Timestamp;
 use deadpool_postgres::{Manager, Pool};
@@ -28,13 +27,15 @@ use tokio_postgres::types::ToSql;

 use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
 use crate::election::{
-    Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
+    Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error::{
-    DeserializeFromJsonSnafu, GetPostgresClientSnafu, NoLeaderSnafu, PostgresExecutionSnafu,
-    Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
+    DeserializeFromJsonSnafu, ElectionNoLeaderSnafu, GetPostgresClientSnafu,
+    PostgresExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu,
+    UnexpectedSnafu,
 };
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};

 struct ElectionSqlFactory<'a> {
    lock_id: u64,
@@ -404,13 +405,13 @@ impl Election for PgElection {
                .get_value_with_lease(&key)
                .await?
                .context(UnexpectedSnafu {
-                    violated: format!("Failed to get lease for key: {:?}", key),
+                    err_msg: format!("Failed to get lease for key: {:?}", key),
                })?;

            ensure!(
                lease.expire_time > lease.current,
                UnexpectedSnafu {
-                    violated: format!(
+                    err_msg: format!(
                        "Candidate lease expired at {:?} (current time {:?}), key: {:?}",
                        lease.expire_time, lease.current, key
                    ),
@@ -464,11 +465,11 @@ impl Election for PgElection {
                .query(&self.sql_set.campaign, &[])
                .await?;
            let row = res.first().context(UnexpectedSnafu {
-                violated: "Failed to get the result of acquiring advisory lock",
+                err_msg: "Failed to get the result of acquiring advisory lock".to_string(),
            })?;
            let is_leader = row.try_get(0).map_err(|_| {
                UnexpectedSnafu {
-                    violated: "Failed to get the result of get lock",
+                    err_msg: "Failed to get the result of get lock".to_string(),
                }
                .build()
            })?;
@@ -500,10 +501,10 @@ impl Election for PgElection {
        } else {
            let key = self.election_key();
            if let Some(lease) = self.get_value_with_lease(&key).await? {
-                ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+                ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
                Ok(lease.leader_value.as_bytes().into())
            } else {
-                NoLeaderSnafu.fail()
+                ElectionNoLeaderSnafu.fail()
            }
        }
    }
@@ -537,7 +538,7 @@ impl PgElection {
            let current_time = match Timestamp::from_str(current_time_str, None) {
                Ok(ts) => ts,
                Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                }
                .fail()?,
            };
@@ -576,7 +577,7 @@ impl PgElection {
            current = match Timestamp::from_str(current_time_str, None) {
                Ok(ts) => ts,
                Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                }
                .fail()?,
            };
@@ -613,7 +614,7 @@ impl PgElection {
        ensure!(
            res == 1,
            UnexpectedSnafu {
-                violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
+                err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
            }
        );

@@ -742,9 +743,9 @@ impl PgElection {
        let lease = self
            .get_value_with_lease(&key)
            .await?
-            .context(NoLeaderSnafu)?;
+            .context(ElectionNoLeaderSnafu)?;
        // Case 2
-        ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+        ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
        // Case 3
        Ok(())
    }
@@ -830,11 +831,11 @@ impl PgElection {
 mod tests {
    use std::{assert_matches, env};

-    use common_meta::maybe_skip_postgres_integration_test;
+    use deadpool_postgres::{Config, Runtime};
+    use tokio_postgres::NoTls;

    use super::*;
-    use crate::error;
-    use crate::utils::postgres::create_postgres_pool;
+    use crate::{error, maybe_skip_postgres_integration_test};

    async fn create_postgres_client(
        table_name: Option<&str>,
@@ -845,11 +846,13 @@ mod tests {
        let endpoint = env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default();
        if endpoint.is_empty() {
            return UnexpectedSnafu {
-                violated: "Postgres endpoint is empty".to_string(),
+                err_msg: "Postgres endpoint is empty".to_string(),
            }
            .fail();
        }
-        let pool = create_postgres_pool(&[endpoint], None, None).await.unwrap();
+        let mut cfg = Config::new();
+        cfg.url = Some(endpoint);
+        let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls).unwrap();
        let mut pg_client = ElectionPgClient::new(
            pool,
            execution_timeout,
--- a/src/common/meta/src/error.rs
+++ b/src/common/meta/src/error.rs
@@ -338,6 +338,24 @@ pub enum Error {
        location: Location,
    },

+    #[snafu(display("Metasrv election has no leader at this moment"))]
+    ElectionNoLeader {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Metasrv election leader lease expired"))]
+    ElectionLeaderLeaseExpired {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Metasrv election leader lease changed during election"))]
+    ElectionLeaderLeaseChanged {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display("Table already exists, table: {}", table_name))]
    TableAlreadyExists {
        table_name: String,
@@ -714,6 +732,16 @@ pub enum Error {
    #[snafu(display("Failed to get cache"))]
    GetCache { source: Arc<Error> },

+    #[snafu(display(
+        "Failed to get latest cache value after {} attempts due to concurrent invalidation",
+        attempts
+    ))]
+    GetLatestCacheRetryExceeded {
+        attempts: usize,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[cfg(feature = "pg_kvbackend")]
    #[snafu(display("Failed to execute via Postgres, sql: {}", sql))]
    PostgresExecution {
@@ -741,6 +769,15 @@ pub enum Error {
        location: Location,
    },

+    #[cfg(feature = "pg_kvbackend")]
+    #[snafu(display("Failed to get Postgres client"))]
+    GetPostgresClient {
+        #[snafu(source)]
+        error: deadpool::managed::PoolError<tokio_postgres::Error>,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[cfg(feature = "pg_kvbackend")]
    #[snafu(display("Failed to {} Postgres transaction", operation))]
    PostgresTransaction {
@@ -795,6 +832,24 @@ pub enum Error {
        location: Location,
    },

+    #[cfg(feature = "mysql_kvbackend")]
+    #[snafu(display("Failed to decode sql value"))]
+    DecodeSqlValue {
+        #[snafu(source)]
+        error: sqlx::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[cfg(feature = "mysql_kvbackend")]
+    #[snafu(display("Failed to acquire mysql client from pool"))]
+    AcquireMySqlClient {
+        #[snafu(source)]
+        error: sqlx::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[cfg(feature = "mysql_kvbackend")]
    #[snafu(display("Failed to {} MySql transaction", operation))]
    MySqlTransaction {
@@ -812,6 +867,15 @@ pub enum Error {
        location: Location,
    },

+    #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
+    #[snafu(display("Sql execution timeout, sql: {}, duration: {:?}", sql, duration))]
+    SqlExecutionTimeout {
+        sql: String,
+        duration: std::time::Duration,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display(
        "Datanode table info not found, table id: {}, datanode id: {}",
        table_id,
@@ -1063,8 +1127,12 @@ impl ErrorExt for Error {
            | ConnectEtcd { .. }
            | MoveValues { .. }
            | GetCache { .. }
+            | GetLatestCacheRetryExceeded { .. }
            | SerializeToJson { .. }
-            | DeserializeFromJson { .. } => StatusCode::Internal,
+            | DeserializeFromJson { .. }
+            | ElectionNoLeader { .. }
+            | ElectionLeaderLeaseExpired { .. }
+            | ElectionLeaderLeaseChanged { .. } => StatusCode::Internal,

            NoLeader { .. } => StatusCode::TableUnavailable,
            ValueNotExist { .. }
@@ -1187,15 +1255,18 @@ impl ErrorExt for Error {
            PostgresExecution { .. }
            | CreatePostgresPool { .. }
            | GetPostgresConnection { .. }
+            | GetPostgresClient { .. }
            | PostgresTransaction { .. }
            | PostgresTlsConfig { .. }
            | InvalidTlsConfig { .. } => StatusCode::Internal,
            #[cfg(feature = "mysql_kvbackend")]
-            MySqlExecution { .. } | CreateMySqlPool { .. } | MySqlTransaction { .. } => {
-                StatusCode::Internal
-            }
+            MySqlExecution { .. }
+            | CreateMySqlPool { .. }
+            | DecodeSqlValue { .. }
+            | AcquireMySqlClient { .. }
+            | MySqlTransaction { .. } => StatusCode::Internal,
            #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
-            RdsTransactionRetryFailed { .. } => StatusCode::Internal,
+            RdsTransactionRetryFailed { .. } | SqlExecutionTimeout { .. } => StatusCode::Internal,
            DatanodeTableInfoNotFound { .. } => StatusCode::Internal,
        }
    }
@@ -1243,7 +1314,10 @@ impl Error {

    /// Determine whether it is a retry later type through [StatusCode]
    pub fn is_retry_later(&self) -> bool {
-        matches!(self, Error::RetryLater { .. })
+        matches!(
+            self,
+            Error::RetryLater { .. } | Error::GetLatestCacheRetryExceeded { .. }
+        )
    }

    /// Determine whether it needs to clean poisons.
--- a/src/common/meta/src/lib.rs
+++ b/src/common/meta/src/lib.rs
@@ -19,6 +19,7 @@ pub mod datanode;
 pub mod ddl;
 pub mod ddl_manager;
 pub mod distributed_time_constants;
+pub mod election;
 pub mod error;
 pub mod flow_name;
 pub mod heartbeat;
--- a/src/frontend/src/instance/grpc.rs
+++ b/src/frontend/src/instance/grpc.rs
@@ -27,7 +27,6 @@ use api::v1::{
 use async_stream::try_stream;
 use async_trait::async_trait;
 use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
-use common_base::AffectedRows;
 use common_error::ext::BoxedError;
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
@@ -260,62 +259,6 @@ impl GrpcQueryHandler for Instance {
            .context(server_error::ExecuteGrpcQuerySnafu)
    }

-    async fn put_record_batch(
-        &self,
-        request: servers::grpc::flight::PutRecordBatchRequest,
-        table_ref: &mut Option<TableRef>,
-        ctx: QueryContextRef,
-    ) -> server_error::Result<AffectedRows> {
-        let result: Result<AffectedRows> = async {
-            let table = if let Some(table) = table_ref {
-                table.clone()
-            } else {
-                let table = self
-                    .catalog_manager()
-                    .table(
-                        &request.table_name.catalog_name,
-                        &request.table_name.schema_name,
-                        &request.table_name.table_name,
-                        None,
-                    )
-                    .await
-                    .context(CatalogSnafu)?
-                    .with_context(|| TableNotFoundSnafu {
-                        table_name: request.table_name.to_string(),
-                    })?;
-                *table_ref = Some(table.clone());
-                table
-            };
-
-            let interceptor_ref = self.plugins.get::<GrpcQueryInterceptorRef<Error>>();
-            let interceptor = interceptor_ref.as_ref();
-            interceptor.pre_bulk_insert(table.clone(), ctx.clone())?;
-
-            self.plugins
-                .get::<PermissionCheckerRef>()
-                .as_ref()
-                .check_permission(ctx.current_user(), PermissionReq::BulkInsert)
-                .context(PermissionSnafu)?;
-
-            // do we check limit for bulk insert?
-
-            self.inserter
-                .handle_bulk_insert(
-                    table,
-                    request.flight_data,
-                    request.record_batch,
-                    request.schema_bytes,
-                )
-                .await
-                .context(TableOperationSnafu)
-        }
-        .await;
-
-        result
-            .map_err(BoxedError::new)
-            .context(server_error::ExecuteGrpcRequestSnafu)
-    }
-
    fn handle_put_record_batch_stream(
        &self,
        stream: servers::grpc::flight::PutRecordBatchRequestStream,
--- a/src/meta-srv/src/bootstrap.rs
+++ b/src/meta-srv/src/bootstrap.rs
@@ -24,6 +24,8 @@ use common_base::Plugins;
 use common_config::Configurable;
 #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
 use common_meta::distributed_time_constants::META_LEASE_SECS;
+use common_meta::election::CANDIDATE_LEASE_SECS;
+use common_meta::election::etcd::EtcdElection;
 use common_meta::kv_backend::chroot::ChrootKvBackend;
 use common_meta::kv_backend::etcd::EtcdStore;
 use common_meta::kv_backend::memory::MemoryKvBackend;
@@ -42,9 +44,6 @@ use tonic::codec::CompressionEncoding;
 use tonic::transport::server::{Router, TcpIncoming};

 use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef};
-#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
-use crate::election::CANDIDATE_LEASE_SECS;
-use crate::election::etcd::EtcdElection;
 use crate::error::OtherSnafu;
 use crate::metasrv::builder::MetasrvBuilder;
 use crate::metasrv::{
@@ -281,7 +280,8 @@ pub async fn metasrv_builder(
                etcd_client,
                opts.store_key_prefix.clone(),
            )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;

            (kv_backend, Some(election))
        }
@@ -290,10 +290,10 @@ pub async fn metasrv_builder(
            use std::time::Duration;

            use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS;
+            use common_meta::election::rds::postgres::{ElectionPgClient, PgElection};
            use common_meta::kv_backend::rds::PgStore;
            use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod};

-            use crate::election::rds::postgres::{ElectionPgClient, PgElection};
            use crate::utils::postgres::create_postgres_pool;

            let candidate_lease_ttl = Duration::from_secs(CANDIDATE_LEASE_SECS);
@@ -321,7 +321,8 @@ pub async fn metasrv_builder(
                execution_timeout,
                idle_session_timeout,
                statement_timeout,
-            )?;
+            )
+            .context(error::KvBackendSnafu)?;
            let election = PgElection::with_pg_client(
                opts.grpc.server_addr.clone(),
                election_client,
@@ -332,7 +333,8 @@ pub async fn metasrv_builder(
                &opts.meta_table_name,
                opts.meta_election_lock_id,
            )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;

            let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
                .await?;
@@ -352,9 +354,9 @@ pub async fn metasrv_builder(
        (None, BackendImpl::MysqlStore) => {
            use std::time::Duration;

+            use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
            use common_meta::kv_backend::rds::MySqlStore;

-            use crate::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
            use crate::utils::mysql::create_mysql_pool;

            let pool = create_mysql_pool(&opts.store_addrs, opts.backend_tls.as_ref()).await?;
@@ -389,7 +391,8 @@ pub async fn metasrv_builder(
                meta_lease_ttl,
                &election_table_name,
            )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;
            (kv_backend, Some(election))
        }
    };
--- a/src/meta-srv/src/cluster.rs
+++ b/src/meta-srv/src/cluster.rs
@@ -247,7 +247,7 @@ impl MetaPeerClient {
        // Safety: when self.is_leader() == false, election must not empty.
        let election = self.election.as_ref().unwrap();

-        let leader_addr = election.leader().await?.0;
+        let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;

        let channel = self
            .channel_manager
@@ -279,7 +279,7 @@ impl MetaPeerClient {
        // Safety: when self.is_leader() == false, election must not empty.
        let election = self.election.as_ref().unwrap();

-        let leader_addr = election.leader().await?.0;
+        let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;

        let channel = self
            .channel_manager
--- a/src/meta-srv/src/lib.rs
+++ b/src/meta-srv/src/lib.rs
@@ -19,7 +19,6 @@ pub mod bootstrap;
 pub mod cache_invalidator;
 pub mod cluster;
 pub mod discovery;
-pub mod election;
 pub mod error;
 pub mod events;
 mod failure_detector;
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -32,6 +32,8 @@ use common_meta::ddl_manager::DdlManagerRef;
 use common_meta::distributed_time_constants::{
    self, BASE_HEARTBEAT_INTERVAL, default_distributed_time_constants, frontend_heartbeat_interval,
 };
+use common_meta::election::LeaderChangeMessage;
+pub use common_meta::election::{ElectionRef, MetasrvNodeInfo};
 use common_meta::key::TableMetadataManagerRef;
 use common_meta::key::runtime_switch::RuntimeSwitchManagerRef;
 use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef};
@@ -64,7 +66,6 @@ use tokio::sync::broadcast::error::RecvError;

 use crate::cluster::MetaPeerClientRef;
 use crate::discovery;
-use crate::election::{Election, LeaderChangeMessage};
 use crate::error::{
    self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu,
    StartTelemetryTaskSnafu, StopProcedureManagerSnafu,
@@ -459,76 +460,6 @@ impl Context {
    }
 }

-/// The value of the leader. It is used to store the leader's address.
-pub struct LeaderValue(pub String);
-
-impl<T: AsRef<[u8]>> From<T> for LeaderValue {
-    fn from(value: T) -> Self {
-        let string = String::from_utf8_lossy(value.as_ref());
-        Self(string.to_string())
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct MetasrvNodeInfo {
-    // The metasrv's address
-    pub addr: String,
-    // The node build version
-    pub version: String,
-    // The node build git commit hash
-    pub git_commit: String,
-    // The node start timestamp in milliseconds
-    pub start_time_ms: u64,
-    // The node total cpu millicores
-    #[serde(default)]
-    pub total_cpu_millicores: i64,
-    // The node total memory bytes
-    #[serde(default)]
-    pub total_memory_bytes: i64,
-    /// The node build cpu usage millicores
-    #[serde(default)]
-    pub cpu_usage_millicores: i64,
-    /// The node build memory usage bytes
-    #[serde(default)]
-    pub memory_usage_bytes: i64,
-    // The node hostname
-    #[serde(default)]
-    pub hostname: String,
-}
-
-// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
-#[allow(deprecated)]
-impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
-    fn from(node_info: MetasrvNodeInfo) -> Self {
-        Self {
-            peer: Some(api::v1::meta::Peer {
-                addr: node_info.addr,
-                ..Default::default()
-            }),
-            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
-            // New code should use the fields in `info.NodeInfo` instead.
-            version: node_info.version.clone(),
-            git_commit: node_info.git_commit.clone(),
-            start_time_ms: node_info.start_time_ms,
-            cpus: node_info.total_cpu_millicores as u32,
-            memory_bytes: node_info.total_memory_bytes as u64,
-            // The canonical location for node information.
-            info: Some(api::v1::meta::NodeInfo {
-                version: node_info.version,
-                git_commit: node_info.git_commit,
-                start_time_ms: node_info.start_time_ms,
-                total_cpu_millicores: node_info.total_cpu_millicores,
-                total_memory_bytes: node_info.total_memory_bytes,
-                cpu_usage_millicores: node_info.cpu_usage_millicores,
-                memory_usage_bytes: node_info.memory_usage_bytes,
-                cpus: node_info.total_cpu_millicores as u32,
-                memory_bytes: node_info.total_memory_bytes as u64,
-                hostname: node_info.hostname,
-            }),
-        }
-    }
-}
-
 #[derive(Clone, Copy)]
 pub enum SelectTarget {
    Datanode,
@@ -552,7 +483,6 @@ pub struct SelectorContext {
 pub type SelectorRef = Arc<dyn Selector<Context = SelectorContext, Output = Vec<Peer>>>;
 pub type RegionStatAwareSelectorRef =
    Arc<dyn RegionStatAwareSelector<Context = SelectorContext, Output = Vec<(RegionId, Peer)>>>;
-pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;

 pub struct MetaStateHandler {
    subscribe_manager: Option<SubscriptionManagerRef>,
--- a/src/meta-srv/src/service/admin/leader.rs
+++ b/src/meta-srv/src/service/admin/leader.rs
@@ -32,7 +32,7 @@ pub struct LeaderHandler {
 impl LeaderHandler {
    async fn get_leader(&self) -> Result<Option<String>> {
        if let Some(election) = &self.election {
-            let leader_addr = election.leader().await?.0;
+            let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
            return Ok(Some(leader_addr));
        }
        Ok(None)
--- a/src/meta-srv/src/service/cluster.rs
+++ b/src/meta-srv/src/service/cluster.rs
@@ -63,7 +63,10 @@ impl cluster_server::Cluster for Metasrv {
        let leader_addr = &self.options().grpc.server_addr;
        let (leader, followers) = match self.election() {
            Some(election) => {
-                let nodes = election.all_candidates().await?;
+                let nodes = election
+                    .all_candidates()
+                    .await
+                    .context(error::KvBackendSnafu)?;
                let followers = nodes
                    .into_iter()
                    .filter(|node_info| &node_info.addr != leader_addr)
--- a/src/meta-srv/src/service/heartbeat.rs
+++ b/src/meta-srv/src/service/heartbeat.rs
@@ -23,7 +23,7 @@ use api::v1::meta::{
 use common_telemetry::{debug, error, info, warn};
 use futures::StreamExt;
 use once_cell::sync::OnceCell;
-use snafu::OptionExt;
+use snafu::{OptionExt, ResultExt};
 use tokio::sync::mpsc;
 use tokio::sync::mpsc::Sender;
 use tokio_stream::wrappers::ReceiverStream;
@@ -148,7 +148,7 @@ async fn handle_ask_leader(_req: AskLeaderRequest, ctx: Context) -> Result<AskLe
            if election.is_leader() {
                ctx.server_addr
            } else {
-                election.leader().await?.0
+                election.leader().await.context(error::KvBackendSnafu)?.0
            }
        }
        None => ctx.server_addr,
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -108,6 +108,11 @@ name = "memtable_bench"
 harness = false
 required-features = ["test"]

+[[bench]]
+name = "bench_cache_stream"
+harness = false
+required-features = ["test"]
+
 [[bench]]
 name = "bench_filter_time_partition"
 harness = false
--- a/src/mito2/benches/bench_cache_stream.rs
+++ b/src/mito2/benches/bench_cache_stream.rs
@@ -0,0 +1,126 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Benchmarks for `cache_flat_range_stream` overhead.
+//!
+//! Compares consuming batches from a plain stream vs through the caching wrapper
+//! that clones batches for the range cache.
+//!
+//! Run with:
+//! ```sh
+//! cargo bench -p mito2 --features test --bench bench_cache_stream
+//! ```
+
+use std::collections::VecDeque;
+use std::sync::Arc;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use futures::TryStreamExt;
+use mito_codec::row_converter::DensePrimaryKeyCodec;
+use mito2::memtable::bulk::context::BulkIterContext;
+use mito2::memtable::bulk::part::{BulkPartConverter, BulkPartEncoder};
+use mito2::memtable::bulk::part_reader::EncodedBulkPartIter;
+use mito2::read::range_cache::bench_cache_flat_range_stream;
+use mito2::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
+use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
+use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
+
+/// Benchmarks draining an encoded bulk part through a plain stream versus the same stream
+/// wrapped by `bench_cache_flat_range_stream`, to expose the overhead added by the wrapper.
+fn cache_flat_range_stream_bench(c: &mut Criterion) {
+    let metadata = Arc::new(cpu_metadata());
+    let region_id = metadata.region_id;
+    let start_sec = 1710043200;
+    // 2000 hosts × 51 steps = 102,000 rows ≈ DEFAULT_ROW_GROUP_SIZE
+    let num_hosts = 2000;
+    let end_sec = start_sec + 510;
+    let generator = CpuDataGenerator::new(metadata.clone(), num_hosts, start_sec, end_sec);
+
+    // Build a BulkPart from all the generated data
+    let schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default());
+    let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
+
+    let mut converter = BulkPartConverter::new(
+        &metadata,
+        schema,
+        DEFAULT_ROW_GROUP_SIZE,
+        codec,
+        true, // store_pk_columns
+    );
+    for kvs in generator.iter() {
+        converter.append_key_values(&kvs).unwrap();
+    }
+    let bulk_part = converter.convert().unwrap();
+
+    // Encode to parquet
+    let encoder = BulkPartEncoder::new(metadata.clone(), DEFAULT_ROW_GROUP_SIZE).unwrap();
+    let encoded_part = encoder.encode_part(&bulk_part).unwrap().unwrap();
+
+    // Shared read context and the row groups that every iteration decodes
+    let num_row_groups = encoded_part.metadata().parquet_metadata.num_row_groups();
+    let context = Arc::new(
+        BulkIterContext::new(
+            metadata.clone(),
+            None, // No projection
+            None, // No predicate
+            false,
+        )
+        .unwrap(),
+    );
+    let row_groups: VecDeque<usize> = (0..num_row_groups).collect();
+    let rt = tokio::runtime::Runtime::new().unwrap();
+    let mut group = c.benchmark_group("cache_flat_range_stream");
+    group.sample_size(10);
+
+    group.bench_function("baseline_iter_stream", |b| {
+        b.iter(|| {
+            rt.block_on(async {
+                let iter = EncodedBulkPartIter::try_new(
+                    &encoded_part,
+                    context.clone(),
+                    row_groups.clone(),
+                    None,
+                    None,
+                )
+                .unwrap();
+                let mut stream: mito2::read::BoxedRecordBatchStream =
+                    Box::pin(futures::stream::iter(iter));
+                while let Some(_batch) = stream.try_next().await.unwrap() {}
+            });
+        });
+    });
+
+    group.bench_function("cache_flat_range_stream", |b| {
+        b.iter(|| {
+            rt.block_on(async {
+                let iter = EncodedBulkPartIter::try_new(
+                    &encoded_part,
+                    context.clone(),
+                    row_groups.clone(),
+                    None,
+                    None,
+                )
+                .unwrap();
+                let stream: mito2::read::BoxedRecordBatchStream =
+                    Box::pin(futures::stream::iter(iter));
+                let mut stream = bench_cache_flat_range_stream(stream, 64 * 1024 * 1024, region_id);
+                while let Some(_batch) = stream.try_next().await.unwrap() {}
+            });
+        });
+    });
+    group.finish();
+}
+
+criterion_group!(benches, cache_flat_range_stream_bench);
+criterion_main!(benches);
--- a/src/mito2/benches/memtable_bench.rs
+++ b/src/mito2/benches/memtable_bench.rs
@@ -12,15 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+//! Benchmarks for memtable operations: writes, full scans, filtered scans,
+//! bulk part conversion, record batch iteration with filters, and flat merge.
+//!
+//! Run with:
+//! ```sh
+//! cargo bench -p mito2 --features test --bench memtable_bench
+//! ```
+
 use std::sync::Arc;

-use api::v1::value::ValueData;
-use api::v1::{Row, Rows, SemanticType};
 use criterion::{Criterion, criterion_group, criterion_main};
-use datafusion_common::Column;
-use datafusion_expr::{Expr, lit};
-use datatypes::data_type::ConcreteDataType;
-use datatypes::schema::ColumnSchema;
 use mito_codec::row_converter::DensePrimaryKeyCodec;
 use mito2::memtable::bulk::context::BulkIterContext;
 use mito2::memtable::bulk::part::BulkPartConverter;
@@ -28,20 +30,13 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter;
 use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig};
 use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable};
 use mito2::memtable::time_series::TimeSeriesMemtable;
-use mito2::memtable::{IterBuilder, KeyValues, Memtable, RangesOptions};
+use mito2::memtable::{IterBuilder, Memtable, RangesOptions};
 use mito2::read::flat_merge::FlatMergeIterator;
 use mito2::read::scan_region::PredicateGroup;
 use mito2::region::options::MergeMode;
 use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
-use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
-use rand::Rng;
-use rand::rngs::ThreadRng;
-use rand::seq::IndexedRandom;
-use store_api::metadata::{
-    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
-};
-use store_api::storage::RegionId;
-use table::predicate::Predicate;
+use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
+use mito2::test_util::memtable_util;

 /// Writes rows.
 fn write_rows(c: &mut Criterion) {
@@ -216,224 +211,6 @@ fn filter_1_host(c: &mut Criterion) {
    });
 }

-struct Host {
-    hostname: String,
-    region: String,
-    datacenter: String,
-    rack: String,
-    os: String,
-    arch: String,
-    team: String,
-    service: String,
-    service_version: String,
-    service_environment: String,
-}
-
-impl Host {
-    fn random_with_id(id: usize) -> Host {
-        let mut rng = rand::rng();
-        let region = format!("ap-southeast-{}", rng.random_range(0..10));
-        let datacenter = format!(
-            "{}{}",
-            region,
-            ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
-        );
-        Host {
-            hostname: format!("host_{id}"),
-            region,
-            datacenter,
-            rack: rng.random_range(0..100).to_string(),
-            os: "Ubuntu16.04LTS".to_string(),
-            arch: "x86".to_string(),
-            team: "CHI".to_string(),
-            service: rng.random_range(0..100).to_string(),
-            service_version: rng.random_range(0..10).to_string(),
-            service_environment: "test".to_string(),
-        }
-    }
-
-    fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
-        let tags = [
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.hostname.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.region.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.datacenter.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.rack.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.os.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.arch.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.team.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service_version.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service_environment.clone())),
-            },
-        ];
-        for tag in tags {
-            values.push(tag);
-        }
-    }
-}
-
-struct CpuDataGenerator {
-    metadata: RegionMetadataRef,
-    column_schemas: Vec<api::v1::ColumnSchema>,
-    hosts: Vec<Host>,
-    start_sec: i64,
-    end_sec: i64,
-}
-
-impl CpuDataGenerator {
-    fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
-        let column_schemas = region_metadata_to_row_schema(&metadata);
-        Self {
-            metadata,
-            column_schemas,
-            hosts: Self::generate_hosts(num_hosts),
-            start_sec,
-            end_sec,
-        }
-    }
-
-    fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
-        // point per 10s.
-        (self.start_sec..self.end_sec)
-            .step_by(10)
-            .enumerate()
-            .map(|(seq, ts)| self.build_key_values(seq, ts))
-    }
-
-    fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
-        let rows = self
-            .hosts
-            .iter()
-            .map(|host| {
-                let mut rng = rand::rng();
-                let mut values = Vec::with_capacity(21);
-                values.push(api::v1::Value {
-                    value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
-                });
-                host.fill_values(&mut values);
-                for _ in 0..10 {
-                    values.push(api::v1::Value {
-                        value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
-                    });
-                }
-                Row { values }
-            })
-            .collect();
-        let mutation = api::v1::Mutation {
-            op_type: api::v1::OpType::Put as i32,
-            sequence: seq as u64,
-            rows: Some(Rows {
-                schema: self.column_schemas.clone(),
-                rows,
-            }),
-            write_hint: None,
-        };
-
-        KeyValues::new(&self.metadata, mutation).unwrap()
-    }
-
-    fn random_host_filter(&self) -> Predicate {
-        let host = self.random_hostname();
-        let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
-        Predicate::new(vec![expr])
-    }
-
-    fn random_host_filter_exprs(&self) -> Vec<Expr> {
-        let host = self.random_hostname();
-        vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))]
-    }
-
-    fn random_hostname(&self) -> String {
-        let mut rng = rand::rng();
-        self.hosts.choose(&mut rng).unwrap().hostname.clone()
-    }
-
-    fn random_f64(rng: &mut ThreadRng) -> f64 {
-        let base: u32 = rng.random_range(30..95);
-        base as f64
-    }
-
-    fn generate_hosts(num_hosts: usize) -> Vec<Host> {
-        (0..num_hosts).map(Host::random_with_id).collect()
-    }
-}
-
-/// Creates a metadata for TSBS cpu-like table.
-fn cpu_metadata() -> RegionMetadata {
-    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
-    builder.push_column_metadata(ColumnMetadata {
-        column_schema: ColumnSchema::new(
-            "ts",
-            ConcreteDataType::timestamp_millisecond_datatype(),
-            false,
-        ),
-        semantic_type: SemanticType::Timestamp,
-        column_id: 0,
-    });
-    let mut column_id = 1;
-    let tags = [
-        "hostname",
-        "region",
-        "datacenter",
-        "rack",
-        "os",
-        "arch",
-        "team",
-        "service",
-        "service_version",
-        "service_environment",
-    ];
-    for tag in tags {
-        builder.push_column_metadata(ColumnMetadata {
-            column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
-            semantic_type: SemanticType::Tag,
-            column_id,
-        });
-        column_id += 1;
-    }
-    let fields = [
-        "usage_user",
-        "usage_system",
-        "usage_idle",
-        "usage_nice",
-        "usage_iowait",
-        "usage_irq",
-        "usage_softirq",
-        "usage_steal",
-        "usage_guest",
-        "usage_guest_nice",
-    ];
-    for field in fields {
-        builder.push_column_metadata(ColumnMetadata {
-            column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
-            semantic_type: SemanticType::Field,
-            column_id,
-        });
-        column_id += 1;
-    }
-    builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
-    builder.build().unwrap()
-}
-
 fn bulk_part_converter(c: &mut Criterion) {
    let metadata = Arc::new(cpu_metadata());
    let start_sec = 1710043200;
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -350,7 +350,7 @@ impl CacheStrategy {

    /// Calls [CacheManager::get_range_result()].
    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
-    #[cfg_attr(not(test), allow(dead_code))]
+    #[allow(dead_code)]
    pub(crate) fn get_range_result(
        &self,
        key: &RangeScanCacheKey,
@@ -363,7 +363,6 @@ impl CacheStrategy {

    /// Calls [CacheManager::put_range_result()].
    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
-    #[cfg_attr(not(test), allow(dead_code))]
    pub(crate) fn put_range_result(
        &self,
        key: RangeScanCacheKey,
@@ -476,7 +475,6 @@ pub struct CacheManager {
    /// Cache for time series selectors.
    selector_result_cache: Option<SelectorResultCache>,
    /// Cache for range scan outputs in flat format.
-    #[cfg_attr(not(test), allow(dead_code))]
    range_result_cache: Option<RangeResultCache>,
    /// Cache for index result.
    index_result_cache: Option<IndexResultCache>,
@@ -713,7 +711,7 @@ impl CacheManager {
    }

    /// Gets cached result for range scan.
-    #[cfg_attr(not(test), allow(dead_code))]
+    #[allow(dead_code)]
    pub(crate) fn get_range_result(
        &self,
        key: &RangeScanCacheKey,
@@ -723,8 +721,7 @@ impl CacheManager {
            .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE))
    }

-    /// Puts range scan result into the cache.
-    #[cfg_attr(not(test), allow(dead_code))]
+    /// Puts range scan result into cache.
    pub(crate) fn put_range_result(
        &self,
        key: RangeScanCacheKey,
@@ -949,7 +946,7 @@ impl CacheManagerBuilder {
            Cache::builder()
                .max_capacity(self.range_result_cache_size)
                .weigher(range_result_cache_weight)
-                .eviction_listener(|k, v, cause| {
+                .eviction_listener(move |k, v, cause| {
                    let size = range_result_cache_weight(&k, &v);
                    CACHE_BYTES
                        .with_label_values(&[RANGE_RESULT_TYPE])
@@ -1361,7 +1358,7 @@ mod tests {
            }
            .build(),
        };
-        let value = Arc::new(RangeScanCacheValue::new(Vec::new()));
+        let value = Arc::new(RangeScanCacheValue::new(Vec::new(), 0));

        assert!(cache.get_range_result(&key).is_none());
        cache.put_range_result(key.clone(), value.clone());
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -116,6 +116,8 @@ pub struct MitoConfig {
    pub page_cache_size: ReadableSize,
    /// Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
    pub selector_result_cache_size: ReadableSize,
+    /// Cache size for flat range scan results. Setting it to 0 to disable the cache.
+    pub range_result_cache_size: ReadableSize,
    /// Whether to enable the write cache.
    pub enable_write_cache: bool,
    /// File system path for write cache dir's root, defaults to `{data_home}`.
@@ -200,6 +202,7 @@ impl Default for MitoConfig {
            vector_cache_size: ReadableSize::mb(512),
            page_cache_size: ReadableSize::mb(512),
            selector_result_cache_size: ReadableSize::mb(512),
+            range_result_cache_size: ReadableSize::mb(512),
            enable_write_cache: false,
            write_cache_path: String::new(),
            write_cache_size: ReadableSize::gb(5),
@@ -336,6 +339,7 @@ impl MitoConfig {
        self.vector_cache_size = mem_cache_size;
        self.page_cache_size = page_cache_size;
        self.selector_result_cache_size = mem_cache_size;
+        self.range_result_cache_size = mem_cache_size;

        self.index.adjust_buffer_and_cache_size(sys_memory);
    }
--- a/src/mito2/src/memtable.rs
+++ b/src/mito2/src/memtable.rs
@@ -537,11 +537,15 @@ pub trait IterBuilder: Send + Sync {
    }

    /// Returns the record batch iterator to read the range.
+    /// ## Note
+    /// Implementations should ensure the iterator yields data within the given time range.
    fn build_record_batch(
        &self,
+        time_range: Option<(Timestamp, Timestamp)>,
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedRecordBatchIterator> {
        let _metrics = metrics;
+        let _ = time_range;
        UnsupportedOperationSnafu {
            err_msg: "Record batch iterator is not supported by this memtable",
        }
@@ -700,7 +704,7 @@ impl MemtableRange {
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedRecordBatchIterator> {
        if self.context.builder.is_record_batch() {
-            return self.context.builder.build_record_batch(metrics);
+            return self.context.builder.build_record_batch(time_range, metrics);
        }

        if let Some(context) = self.context.batch_to_record_batch.as_ref() {
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -34,6 +34,7 @@ fn env_usize(name: &str, default: usize) -> usize {
        .unwrap_or(default)
 }

+use common_time::Timestamp;
 use datatypes::arrow::datatypes::SchemaRef;
 use mito_codec::key_values::KeyValue;
 use rayon::prelude::*;
@@ -792,6 +793,7 @@ impl IterBuilder for BulkRangeIterBuilder {

    fn build_record_batch(
        &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedRecordBatchIterator> {
        let series_count = self.part.estimated_series_count();
@@ -825,6 +827,7 @@ impl IterBuilder for MultiBulkRangeIterBuilder {

    fn build_record_batch(
        &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedRecordBatchIterator> {
        self.part
@@ -864,6 +867,7 @@ impl IterBuilder for EncodedBulkRangeIterBuilder {

    fn build_record_batch(
        &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedRecordBatchIterator> {
        if let Some(iter) = self
--- a/src/mito2/src/memtable/bulk/part.rs
+++ b/src/mito2/src/memtable/bulk/part.rs
@@ -967,7 +967,7 @@ impl EncodedBulkPart {
        Self { data, metadata }
    }

-    pub(crate) fn metadata(&self) -> &BulkPartMeta {
+    pub fn metadata(&self) -> &BulkPartMeta {
        &self.metadata
    }

@@ -977,7 +977,7 @@ impl EncodedBulkPart {
    }

    /// Returns the encoded data.
-    pub(crate) fn data(&self) -> &Bytes {
+    pub fn data(&self) -> &Bytes {
        &self.data
    }

@@ -1121,10 +1121,7 @@ pub struct BulkPartEncoder {
 }

 impl BulkPartEncoder {
-    pub(crate) fn new(
-        metadata: RegionMetadataRef,
-        row_group_size: usize,
-    ) -> Result<BulkPartEncoder> {
+    pub fn new(metadata: RegionMetadataRef, row_group_size: usize) -> Result<BulkPartEncoder> {
        // TODO(yingwen): Skip arrow schema if needed.
        let json = metadata.to_json().context(InvalidMetadataSnafu)?;
        let key_value_meta =
@@ -1216,7 +1213,7 @@ impl BulkPartEncoder {
    }

    /// Encodes bulk part to a [EncodedBulkPart], returns the encoded data.
-    fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
+    pub fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
        if part.batch.num_rows() == 0 {
            return Ok(None);
        }
--- a/src/mito2/src/memtable/bulk/part_reader.rs
+++ b/src/mito2/src/memtable/bulk/part_reader.rs
@@ -50,7 +50,7 @@ pub struct EncodedBulkPartIter {

 impl EncodedBulkPartIter {
    /// Creates a new [BulkPartIter].
-    pub(crate) fn try_new(
+    pub fn try_new(
        encoded_part: &EncodedBulkPart,
        context: BulkIterContextRef,
        mut row_groups_to_read: VecDeque<usize>,
--- a/src/mito2/src/memtable/time_series.rs
+++ b/src/mito2/src/memtable/time_series.rs
@@ -51,15 +51,18 @@ use crate::memtable::bulk::part::BulkPart;
 use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable;
 use crate::memtable::stats::WriteMetrics;
 use crate::memtable::{
-    AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues,
-    MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext,
-    MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection,
+    AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, BoxedRecordBatchIterator,
+    IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange,
+    MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions,
+    read_column_ids_from_projection,
 };
 use crate::metrics::{
    MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT, MEMTABLE_ACTIVE_SERIES_COUNT, READ_ROWS_TOTAL,
    READ_STAGE_ELAPSED,
 };
 use crate::read::dedup::LastNonNullIter;
+use crate::read::prune::PruneTimeIterator;
+use crate::read::scan_region::PredicateGroup;
 use crate::read::{Batch, BatchBuilder, BatchColumn};
 use crate::region::options::MergeMode;

@@ -283,25 +286,20 @@ impl Memtable for TimeSeriesMemtable {
                .map(|c| c.column_id)
                .collect()
        };
-        let builder = Box::new(TimeSeriesIterBuilder {
-            series_set: self.series_set.clone(),
-            projection,
-            predicate: predicate.predicate().cloned(),
-            dedup: self.dedup,
-            merge_mode: self.merge_mode,
-            sequence,
-        });
-        let adapter_context = Arc::new(BatchToRecordBatchContext::new(
+        let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new(
            self.region_metadata.clone(),
            read_column_ids,
        ));
-        let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch(
-            self.id,
-            builder,
-            predicate,
-            Some(adapter_context),
-        ));
-
+        let builder = Box::new(TimeSeriesIterBuilder {
+            series_set: self.series_set.clone(),
+            projection,
+            predicate: predicate.clone(),
+            dedup: self.dedup,
+            merge_mode: self.merge_mode,
+            sequence,
+            batch_to_record_batch,
+        });
+        let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
        let range_stats = self.stats();
        let range = MemtableRange::new(context, range_stats);
        Ok(MemtableRanges {
@@ -443,7 +441,7 @@ impl SeriesSet {
    fn iter_series(
        &self,
        projection: HashSet<ColumnId>,
-        predicate: Option<Predicate>,
+        predicate: PredicateGroup,
        dedup: bool,
        merge_mode: MergeMode,
        sequence: Option<SequenceRange>,
@@ -460,7 +458,7 @@ impl SeriesSet {
            self.region_metadata.clone(),
            self.series.clone(),
            projection,
-            predicate,
+            predicate.predicate().cloned(),
            primary_key_schema,
            primary_key_datatypes,
            self.codec.clone(),
@@ -1245,10 +1243,11 @@ impl From<ValueBuilder> for Values {
 struct TimeSeriesIterBuilder {
    series_set: SeriesSet,
    projection: HashSet<ColumnId>,
-    predicate: Option<Predicate>,
+    predicate: PredicateGroup,
    dedup: bool,
    sequence: Option<SequenceRange>,
    merge_mode: MergeMode,
+    batch_to_record_batch: Arc<BatchToRecordBatchContext>,
 }

 impl IterBuilder for TimeSeriesIterBuilder {
@@ -1268,6 +1267,25 @@ impl IterBuilder for TimeSeriesIterBuilder {
            Ok(Box::new(iter))
        }
    }
+
+    fn is_record_batch(&self) -> bool {
+        true
+    }
+
+    fn build_record_batch(
+        &self,
+        time_range: Option<(Timestamp, Timestamp)>,
+        metrics: Option<MemScanMetrics>,
+    ) -> Result<BoxedRecordBatchIterator> {
+        let batches = self.build(metrics)?;
+        // Without a time range there is nothing to prune; adapt the batches as they are.
+        let Some(range) = time_range else {
+            return Ok(self.batch_to_record_batch.adapt_iter(batches));
+        };
+        let filters = self.predicate.time_filters();
+        let pruned: BoxedBatchIterator = Box::new(PruneTimeIterator::new(batches, range, filters));
+        Ok(self.batch_to_record_batch.adapt_iter(pruned))
+    }
 }

 #[cfg(test)]
@@ -2014,4 +2032,265 @@ mod tests {
        all_timestamps.sort();
        assert_eq!(vec![3, 4, 5, 6, 7], all_timestamps);
    }
+
+    /// Test helper: assembles a [TimeSeriesIterBuilder] over `memtable` with a default predicate.
+    fn build_iter_builder(
+        schema: &RegionMetadataRef,
+        memtable: &TimeSeriesMemtable,
+        projection: Option<&[ColumnId]>,
+        dedup: bool,
+        merge_mode: MergeMode,
+        sequence: Option<SequenceRange>,
+    ) -> TimeSeriesIterBuilder {
+        let column_ids = read_column_ids_from_projection(schema, projection);
+        let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new(
+            schema.clone(),
+            column_ids,
+        ));
+        // Use the requested ids as-is, or fall back to every field column of the schema.
+        let projection = match projection {
+            Some(ids) => ids.iter().copied().collect(),
+            None => schema.field_columns().map(|col| col.column_id).collect(),
+        };
+        TimeSeriesIterBuilder {
+            series_set: memtable.series_set.clone(),
+            projection,
+            predicate: PredicateGroup::default(),
+            dedup,
+            merge_mode,
+            sequence,
+            batch_to_record_batch,
+        }
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_basic() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let key_values = build_key_values(&schema, "hello".to_string(), 42, 10);
+        memtable.write(&key_values).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+
+        let mut batches = builder.build_record_batch(None, None).unwrap();
+        let first = batches.next().transpose().unwrap().unwrap();
+        assert_eq!(10, first.num_rows());
+        // Check the column layout of the adapted record batch.
+        let arrow_schema = first.schema();
+        let column_names: Vec<_> = arrow_schema
+            .fields()
+            .iter()
+            .map(|field| field.name().as_str())
+            .collect();
+        assert_eq!(
+            column_names,
+            vec![
+                "k0",
+                "k1",
+                "v0",
+                "v1",
+                "ts",
+                "__primary_key",
+                "__sequence",
+                "__op_type",
+            ]
+        );
+        // Nothing else may be yielded after the first batch.
+        assert!(batches.next().is_none());
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_with_projection() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let key_values = build_key_values(&schema, "test".to_string(), 1, 5);
+        memtable.write(&key_values).unwrap();
+
+        // Project only field v0 (column_id=3) and ts (column_id=2).
+        let projected_ids = vec![2, 3];
+        let builder = build_iter_builder(
+            &schema,
+            &memtable,
+            Some(&projected_ids),
+            true,
+            MergeMode::LastRow,
+            None,
+        );
+
+        let mut batches = builder.build_record_batch(None, None).unwrap();
+        let first = batches.next().transpose().unwrap().unwrap();
+        assert_eq!(5, first.num_rows());
+
+        let arrow_schema = first.schema();
+        let column_names: Vec<_> = arrow_schema
+            .fields()
+            .iter()
+            .map(|field| field.name().as_str())
+            .collect();
+        // Only projected columns + internal columns.
+        assert_eq!(
+            column_names,
+            vec!["v0", "ts", "__primary_key", "__sequence", "__op_type"]
+        );
+
+        assert!(batches.next().is_none());
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_multiple_series() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let first_series = build_key_values(&schema, "aaa".to_string(), 1, 3);
+        let second_series = build_key_values(&schema, "bbb".to_string(), 2, 4);
+        memtable.write(&first_series).unwrap();
+        memtable.write(&second_series).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+        let batches = builder.build_record_batch(None, None).unwrap();
+        let mut row_count = 0;
+        for batch in batches {
+            let batch = batch.unwrap();
+            assert_eq!(8, batch.num_columns());
+            row_count += batch.num_rows();
+        }
+        // Both series together must account for every written row.
+        assert_eq!(7, row_count);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_dedup() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        // Write same data twice — dedup should keep only one copy per timestamp.
+        let key_values = build_key_values(&schema, "dup".to_string(), 10, 5);
+        memtable.write(&key_values).unwrap();
+        memtable.write(&key_values).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+
+        let batches = builder.build_record_batch(None, None).unwrap();
+        let row_count: usize = batches.map(|batch| batch.unwrap().num_rows()).sum();
+        assert_eq!(5, row_count);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_no_dedup() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, false, MergeMode::LastRow);
+
+        let key_values = build_key_values(&schema, "dup".to_string(), 10, 5);
+        memtable.write(&key_values).unwrap();
+        memtable.write(&key_values).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, false, MergeMode::LastRow, None);
+        // With dedup disabled, both copies of every row must be returned.
+        let batches = builder.build_record_batch(None, None).unwrap();
+        let row_count: usize = batches.map(|batch| batch.unwrap().num_rows()).sum();
+        assert_eq!(10, row_count);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_with_sequence_filter() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        // build_key_values creates a mutation with base sequence=0.
+        // Each row gets sequence = base + row_index, so 5 rows get sequences 0,1,2,3,4.
+        let key_values = build_key_values(&schema, "seq".to_string(), 1, 5);
+        memtable.write(&key_values).unwrap();
+
+        // Filter to sequence > 4 — should yield no rows.
+        let above_all = build_iter_builder(
+            &schema,
+            &memtable,
+            None,
+            true,
+            MergeMode::LastRow,
+            Some(SequenceRange::Gt { min: 4 }),
+        );
+
+        let batches = above_all.build_record_batch(None, None).unwrap();
+        let row_count: usize = batches.map(|batch| batch.unwrap().num_rows()).sum();
+        assert_eq!(0, row_count);
+
+        // Filter to sequence <= 2 — should yield 3 rows (sequences 0, 1, 2).
+        let up_to_two = build_iter_builder(
+            &schema,
+            &memtable,
+            None,
+            true,
+            MergeMode::LastRow,
+            Some(SequenceRange::LtEq { max: 2 }),
+        );
+
+        let batches = up_to_two.build_record_batch(None, None).unwrap();
+        let row_count: usize = batches.map(|batch| batch.unwrap().num_rows()).sum();
+        assert_eq!(3, row_count);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_data_correctness() {
+        use datatypes::arrow::array::{
+            Float64Array, Int64Array, TimestampMillisecondArray, UInt8Array,
+        };
+
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let key_values = build_key_values(&schema, "check".to_string(), 7, 3);
+        memtable.write(&key_values).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+
+        let mut batches = builder.build_record_batch(None, None).unwrap();
+        let batch = batches.next().transpose().unwrap().unwrap();
+        assert_eq!(3, batch.num_rows());
+
+        // The time index column must hold 0, 1, 2.
+        let ts_array = batch
+            .column_by_name("ts")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<TimestampMillisecondArray>()
+            .unwrap();
+        let ts_values: Vec<_> = (0..ts_array.len()).map(|i| ts_array.value(i)).collect();
+        assert_eq!(vec![0, 1, 2], ts_values);
+
+        // Field v0 must hold the integers 0, 1, 2.
+        let v0_array = batch
+            .column_by_name("v0")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap();
+        let v0_values: Vec<_> = (0..v0_array.len()).map(|i| v0_array.value(i)).collect();
+        assert_eq!(vec![0, 1, 2], v0_values);
+
+        // Field v1 must hold the floats 0.0, 1.0, 2.0.
+        let v1_array = batch
+            .column_by_name("v1")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+        let v1_values: Vec<_> = (0..v1_array.len()).map(|i| v1_array.value(i)).collect();
+        assert_eq!(vec![0.0, 1.0, 2.0], v1_values);
+
+        // Every row must carry the Put op type.
+        let op_types = batch
+            .column_by_name("__op_type")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<UInt8Array>()
+            .unwrap();
+        for row in 0..op_types.len() {
+            assert_eq!(OpType::Put as u8, op_types.value(row));
+        }
+
+        assert!(batches.next().is_none());
+    }
 }
--- a/src/mito2/src/read.rs
+++ b/src/mito2/src/read.rs
@@ -27,6 +27,9 @@ pub mod projection;
 pub(crate) mod prune;
 pub(crate) mod pruner;
 pub mod range;
+#[cfg(feature = "test")]
+pub mod range_cache;
+#[cfg(not(feature = "test"))]
 pub(crate) mod range_cache;
 pub mod scan_region;
 pub mod scan_util;
--- a/src/mito2/src/read/flat_projection.rs
+++ b/src/mito2/src/read/flat_projection.rs
@@ -18,18 +18,21 @@ use std::sync::Arc;

 use api::v1::SemanticType;
 use common_error::ext::BoxedError;
-use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu};
+use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu};
 use common_recordbatch::{DfRecordBatch, RecordBatch};
-use datatypes::arrow::datatypes::Field;
+use datatypes::arrow::array::Array;
+use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
 use datatypes::prelude::{ConcreteDataType, DataType};
 use datatypes::schema::{Schema, SchemaRef};
+use datatypes::value::Value;
 use datatypes::vectors::Helper;
 use snafu::{OptionExt, ResultExt};
 use store_api::metadata::{RegionMetadata, RegionMetadataRef};
 use store_api::storage::ColumnId;

+use crate::cache::CacheStrategy;
 use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result};
-use crate::read::projection::read_column_ids_from_projection;
+use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache};
 use crate::sst::parquet::flat_format::sst_column_id_indices;
 use crate::sst::parquet::format::FormatProjection;
 use crate::sst::{
@@ -248,12 +251,55 @@ impl FlatProjectionMapper {
    pub(crate) fn convert(
        &self,
        batch: &datatypes::arrow::record_batch::RecordBatch,
+        cache_strategy: &CacheStrategy,
    ) -> common_recordbatch::error::Result<RecordBatch> {
        if self.is_empty_projection {
            return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows());
        }
-        let columns = self.project_vectors(batch)?;
-        RecordBatch::new(self.output_schema.clone(), columns)
+        // Build the output record batch directly from Arrow arrays to avoid
+        // Arrow -> Vector -> Arrow roundtrips in the hot path.
+        let mut arrays = Vec::with_capacity(self.output_schema.num_columns());
+        for (output_idx, index) in self.batch_indices.iter().enumerate() {
+            let mut array = batch.column(*index).clone();
+            // Dictionary columns are replaced: by a cached repeated vector, or by a cast to the value type.
+            if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() {
+                // When a string dictionary column contains only a single value, reuse a cached
+                // repeated vector (sized to the batch's row count) instead of expanding the dictionary.
+                if let Some(dict_array) = single_value_string_dictionary(
+                    &array,
+                    &self.output_schema.column_schemas()[output_idx].data_type,
+                    value_type.as_ref(),
+                ) {
+                    let dict_values = dict_array.values();
+                    let value = if dict_values.is_null(0) {
+                        Value::Null
+                    } else {
+                        Value::from(datatypes::arrow_array::string_array_value(dict_values, 0))
+                    };
+
+                    let repeated = repeated_vector_with_cache(
+                        &self.output_schema.column_schemas()[output_idx].data_type,
+                        &value,
+                        batch.num_rows(),
+                        cache_strategy,
+                    )?;
+                    array = repeated.to_arrow_array();
+                } else {
+                    let casted = datatypes::arrow::compute::cast(&array, value_type)
+                        .context(ArrowComputeSnafu)?;
+                    array = casted;
+                }
+            }
+            arrays.push(array);
+        }
+
+        let df_record_batch =
+            DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays)
+                .context(NewDfRecordBatchSnafu)?;
+        Ok(RecordBatch::from_df_record_batch(
+            self.output_schema.clone(),
+            df_record_batch,
+        ))
    }

    /// Projects columns from the input batch and converts them into vectors.
@@ -281,6 +327,28 @@ impl FlatProjectionMapper {
    }
 }

+/// Returns `array` as a `UInt32`-keyed dictionary if it has exactly one string value and no nulls.
+fn single_value_string_dictionary<'a>(
+    array: &'a Arc<dyn Array>,
+    output_type: &ConcreteDataType,
+    value_type: &ArrowDataType,
+) -> Option<&'a datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>> {
+    use datatypes::arrow::array::DictionaryArray;
+    use datatypes::arrow::datatypes::UInt32Type;
+
+    // Only string-valued dictionaries that map to a string output column qualify.
+    let has_string_values = matches!(
+        value_type,
+        ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View
+    );
+    if !(has_string_values && output_type.is_string()) {
+        return None;
+    }
+
+    let dict = array.as_any().downcast_ref::<DictionaryArray<UInt32Type>>()?;
+    (dict.values().len() == 1 && dict.null_count() == 0).then_some(dict)
+}
+
 /// Returns ids and datatypes of columns of the output batch after applying the `projection`.
 ///
 /// It adds the time index column if it doesn't present in the projection.
--- a/src/mito2/src/read/projection.rs
+++ b/src/mito2/src/read/projection.rs
@@ -21,7 +21,7 @@ use std::sync::Arc;
 use api::v1::SemanticType;
 use common_error::ext::BoxedError;
 use common_recordbatch::RecordBatch;
-use common_recordbatch::error::ExternalSnafu;
+use common_recordbatch::error::{DataTypesSnafu, ExternalSnafu};
 use datatypes::prelude::{ConcreteDataType, DataType};
 use datatypes::schema::{Schema, SchemaRef};
 use datatypes::value::Value;
@@ -37,7 +37,7 @@ use crate::read::Batch;
 use crate::read::flat_projection::FlatProjectionMapper;

 /// Only cache vector when its length `<=` this value.
-const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
+pub(crate) const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;

 /// Wrapper enum for different projection mapper implementations.
 pub enum ProjectionMapper {
@@ -423,7 +423,7 @@ enum BatchIndex {
 }

 /// Gets a vector with repeated values from specific cache or creates a new one.
-fn repeated_vector_with_cache(
+pub(crate) fn repeated_vector_with_cache(
    data_type: &ConcreteDataType,
    value: &Value,
    num_rows: usize,
@@ -450,7 +450,7 @@ fn repeated_vector_with_cache(
 }

 /// Returns a vector with repeated values.
-fn new_repeated_vector(
+pub(crate) fn new_repeated_vector(
    data_type: &ConcreteDataType,
    value: &Value,
    num_rows: usize,
@@ -458,8 +458,7 @@ fn new_repeated_vector(
    let mut mutable_vector = data_type.create_mutable_vector(1);
    mutable_vector
        .try_push_value_ref(&value.as_value_ref())
-        .map_err(BoxedError::new)
-        .context(ExternalSnafu)?;
+        .context(DataTypesSnafu)?;
    // This requires an additional allocation.
    let base_vector = mutable_vector.to_vector();
    Ok(base_vector.replicate(&[num_rows]))
@@ -809,6 +808,7 @@ mod tests {
                .num_fields(2)
                .build(),
        );
+        let cache = CacheStrategy::Disabled;
        let mapper = ProjectionMapper::all(&metadata, true).unwrap();
        assert_eq!([0, 1, 2, 3, 4], mapper.column_ids());
        assert_eq!(
@@ -823,7 +823,7 @@ mod tests {
        );

        let batch = new_flat_batch(Some(0), &[(1, 1), (2, 2)], &[(3, 3), (4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
        let expect = "\
 +---------------------+----+----+----+----+
 | ts                  | k0 | k1 | v0 | v1 |
@@ -843,6 +843,7 @@ mod tests {
                .num_fields(2)
                .build(),
        );
+        let cache = CacheStrategy::Disabled;
        // Columns v1, k0
        let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap();
        assert_eq!([4, 1], mapper.column_ids());
@@ -856,7 +857,7 @@ mod tests {
        );

        let batch = new_flat_batch(None, &[(1, 1)], &[(4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
        let expect = "\
 +----+----+
 | v1 | k0 |
@@ -876,6 +877,7 @@ mod tests {
                .num_fields(2)
                .build(),
        );
+        let cache = CacheStrategy::Disabled;
        // Output columns v1, k0. Read also includes v0.
        let mapper = ProjectionMapper::new_with_read_columns(
            &metadata,
@@ -887,7 +889,7 @@ mod tests {
        assert_eq!([4, 1, 3], mapper.column_ids());

        let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
        let expect = "\
 +----+----+
 | v1 | k0 |
@@ -907,6 +909,7 @@ mod tests {
                .num_fields(2)
                .build(),
        );
+        let cache = CacheStrategy::Disabled;
        // Empty projection
        let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap();
        assert_eq!([0], mapper.column_ids()); // Should still read the time index column
@@ -918,7 +921,7 @@ mod tests {
        );

        let batch = new_flat_batch(Some(0), &[], &[], 3);
-        let record_batch = flat_mapper.convert(&batch).unwrap();
+        let record_batch = flat_mapper.convert(&batch, &cache).unwrap();
        assert_eq!(3, record_batch.num_rows());
        assert_eq!(0, record_batch.num_columns());
        assert!(record_batch.schema.is_empty());
--- a/src/mito2/src/read/range_cache.rs
+++ b/src/mito2/src/read/range_cache.rs
@@ -17,12 +17,23 @@
 use std::mem;
 use std::sync::Arc;

+use async_stream::try_stream;
+use common_time::range::TimestampRange;
+use datatypes::arrow::array::{Array, AsArray, DictionaryArray};
+use datatypes::arrow::datatypes::UInt32Type;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::prelude::ConcreteDataType;
+use futures::TryStreamExt;
+use store_api::region_engine::PartitionRange;
 use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector};

-use crate::memtable::record_batch_estimated_size;
+use crate::cache::CacheStrategy;
+use crate::read::BoxedRecordBatchStream;
+use crate::read::scan_region::StreamContext;
+use crate::read::scan_util::PartitionMetrics;
 use crate::region::options::MergeMode;
+use crate::sst::file::FileTimeRange;
+use crate::sst::parquet::flat_format::primary_key_column_index;

 /// Fingerprint of the scan request fields that affect partition range cache reuse.
 ///
@@ -124,7 +135,6 @@ impl ScanRequestFingerprint {
            .unwrap_or(&[])
    }

-    #[cfg(test)]
    pub(crate) fn without_time_filters(&self) -> Self {
        Self {
            inner: Arc::clone(&self.inner),
@@ -163,7 +173,7 @@ impl ScanRequestFingerprint {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub(crate) struct RangeScanCacheKey {
    pub(crate) region_id: RegionId,
-    /// Sorted (file_id, row_group_index) pairs that uniquely identify the covered data.
+    /// Sorted (file_id, row_group_index) pairs that uniquely identify the data this range covers.
    pub(crate) row_groups: Vec<(FileId, i64)>,
    pub(crate) scan: ScanRequestFingerprint,
 }
@@ -179,30 +189,458 @@ impl RangeScanCacheKey {
 /// Cached result for one range scan.
 pub(crate) struct RangeScanCacheValue {
    pub(crate) batches: Vec<RecordBatch>,
+    /// Precomputed size of all batches, accounting for shared dictionary values.
+    estimated_batches_size: usize,
 }

 impl RangeScanCacheValue {
-    #[cfg_attr(not(test), allow(dead_code))]
-    pub(crate) fn new(batches: Vec<RecordBatch>) -> Self {
-        Self { batches }
+    pub(crate) fn new(batches: Vec<RecordBatch>, estimated_batches_size: usize) -> Self {
+        Self {
+            batches,
+            estimated_batches_size,
+        }
    }

    pub(crate) fn estimated_size(&self) -> usize {
        mem::size_of::<Self>()
            + self.batches.capacity() * mem::size_of::<RecordBatch>()
-            + self
-                .batches
-                .iter()
-                .map(record_batch_estimated_size)
-                .sum::<usize>()
+            + self.estimated_batches_size
    }
 }

+/// File-backed row groups of a partition range, plus whether the range reads only from files.
+#[allow(dead_code)] // NOTE(review): presumably unused until the range cache is wired into scans — confirm.
+pub(crate) struct PartitionRangeRowGroups {
+    /// (file_id, row_group_index) pairs, sorted by file id bytes and then by row group index.
+    pub(crate) row_groups: Vec<(FileId, i64)>,
+    pub(crate) only_file_sources: bool, // False if any row group index is not a file range index.
+}
+
+/// Gathers the sorted (file_id, row_group_index) pairs of a partition range's file row groups.
+#[allow(dead_code)]
+pub(crate) fn collect_partition_range_row_groups(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> PartitionRangeRowGroups {
+    let indices = &stream_ctx.ranges[part_range.identifier].row_group_indices;
+    let mut only_file_sources = true;
+    let mut row_groups: Vec<(FileId, i64)> = indices
+        .iter()
+        .filter_map(|index| {
+            if !stream_ctx.is_file_range_index(*index) {
+                only_file_sources = false;
+                return None;
+            }
+            let file_id = stream_ctx.input.file_from_index(*index).file_id().file_id();
+            Some((file_id, index.row_group_index))
+        })
+        .collect();
+    row_groups.sort_unstable_by(|x, y| (x.0.as_bytes(), x.1).cmp(&(y.0.as_bytes(), y.1)));
+
+    PartitionRangeRowGroups {
+        row_groups,
+        only_file_sources,
+    }
+}
+
+/// Derives the range cache key for `part_range`, or `None` when the range must not be cached.
+#[allow(dead_code)]
+pub(crate) fn build_range_cache_key(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> Option<RangeScanCacheKey> {
+    let fingerprint = stream_ctx.scan_fingerprint.as_ref()?;
+    let input = &stream_ctx.input;
+
+    // Dyn filters can change at runtime, so results computed with them are never cached.
+    if input
+        .predicate_group()
+        .predicate_without_region()
+        .is_some_and(|p| !p.dyn_filters().is_empty())
+    {
+        return None;
+    }
+
+    // Every source must be a file, and at least one row group must be covered.
+    let groups = collect_partition_range_row_groups(stream_ctx, part_range);
+    if groups.row_groups.is_empty() || !groups.only_file_sources {
+        return None;
+    }
+
+    // Time filters are dropped from the key when the query time range covers the partition range.
+    let time_range = stream_ctx.ranges[part_range.identifier].time_range;
+    let covered = query_time_range_covers_partition_range(input.time_range.as_ref(), time_range);
+    let scan = if covered {
+        fingerprint.without_time_filters()
+    } else {
+        fingerprint.clone()
+    };
+
+    Some(RangeScanCacheKey {
+        region_id: input.region_metadata().region_id,
+        row_groups: groups.row_groups,
+        scan,
+    })
+}
+
+/// Returns true when the query has no time range or its range contains both partition endpoints.
+#[allow(dead_code)]
+fn query_time_range_covers_partition_range(
+    query_time_range: Option<&TimestampRange>,
+    partition_time_range: FileTimeRange,
+) -> bool {
+    let (part_start, part_end) = partition_time_range;
+    match query_time_range {
+        None => true,
+        Some(range) => range.contains(&part_start) && range.contains(&part_end),
+    }
+}
+
+/// Replays a cached range scan result as a stream of its record batches, in cached order.
+#[allow(dead_code)]
+pub(crate) fn cached_flat_range_stream(value: Arc<RangeScanCacheValue>) -> BoxedRecordBatchStream {
+    // The batch list is cloned so the stream owns its items independently of the cache entry.
+    let replay = value.batches.clone().into_iter().map(Ok);
+    Box::pin(futures::stream::iter(replay))
+}
+
+/// Checks, by pointer comparison, whether two primary key dictionary arrays are backed by
+/// the same values array: same data buffer, same offsets buffer, and same null buffer (if any).
+///
+/// The primary key column is always `DictionaryArray<UInt32Type>` with `Binary` values.
+fn pk_values_ptr_eq(a: &DictionaryArray<UInt32Type>, b: &DictionaryArray<UInt32Type>) -> bool {
+    let (lhs, rhs) = (a.values().as_binary::<i32>(), b.values().as_binary::<i32>());
+    // Null buffers must either both be absent or both be pointer-equal.
+    let nulls_shared = match (lhs.nulls(), rhs.nulls()) {
+        (Some(x), Some(y)) => x.inner().ptr_eq(y.inner()),
+        (None, None) => true,
+        _ => false,
+    };
+    nulls_shared && lhs.values().ptr_eq(rhs.values()) && lhs.offsets().ptr_eq(rhs.offsets())
+}
+
+/// Buffers record batches for caching, tracking memory size while deduplicating
+/// shared dictionary values across batches.
+///
+/// Uses the primary key column as a proxy to detect dictionary sharing: if the PK
+/// column's dictionary values are pointer-equal across batches, we assume all
+/// dictionary columns share their values and deduct the total dictionary values size.
+struct CacheBatchBuffer {
+    batches: Vec<RecordBatch>, // Buffered batches, in the order they were pushed.
+    /// Running size estimate of the buffered batches; shared dictionary values are counted once.
+    total_size: usize,
+    /// The first batch's PK dictionary array, for pointer comparison.
+    /// `None` if no dictionary PK column exists or no batch has been added yet.
+    first_pk_dict: Option<DictionaryArray<UInt32Type>>,
+    /// Sum of `get_array_memory_size()` of all `UInt32`-keyed dictionary value arrays from the first batch.
+    total_dict_values_size: usize,
+    /// Whether the PK dictionary is still shared across all batches seen so far; never reset once false.
+    shared: bool,
+}
+
+impl CacheBatchBuffer {
+    /// Creates an empty buffer; dictionaries count as shared until a batch shows otherwise.
+    fn new() -> Self {
+        Self {
+            batches: Vec::new(),
+            total_size: 0,
+            first_pk_dict: None,
+            total_dict_values_size: 0,
+            shared: true,
+        }
+    }
+
+    /// Adds `batch` to the running size estimate and then stores it.
+    fn push(&mut self, batch: RecordBatch) {
+        if self.batches.is_empty() {
+            self.init_first_batch(&batch);
+        } else {
+            self.add_subsequent_batch(&batch);
+        }
+        self.batches.push(batch);
+    }
+
+    /// Counts the first batch in full and records its dictionary values size and PK dictionary.
+    fn init_first_batch(&mut self, batch: &RecordBatch) {
+        self.total_size += batch.get_array_memory_size();
+        let pk_col_idx = primary_key_column_index(batch.num_columns());
+        for (col_idx, col) in batch.columns().iter().enumerate() {
+            if let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>() {
+                self.total_dict_values_size += dict.values().get_array_memory_size();
+                if col_idx == pk_col_idx {
+                    self.first_pk_dict = Some(dict.clone());
+                }
+            }
+        }
+    }
+
+    /// Counts a later batch, skipping dictionary values while the PK dictionary stays shared.
+    fn add_subsequent_batch(&mut self, batch: &RecordBatch) {
+        let batch_size = batch.get_array_memory_size();
+        if self.shared
+            && let Some(first_pk_dict) = &self.first_pk_dict
+        {
+            let pk_col_idx = primary_key_column_index(batch.num_columns());
+            let col = batch.column(pk_col_idx);
+            if let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>()
+                && pk_values_ptr_eq(first_pk_dict, dict)
+            {
+                // PK dict is shared: deduct all dict values sizes, saturating to avoid underflow.
+                self.total_size += batch_size.saturating_sub(self.total_dict_values_size);
+                return;
+            }
+            // Dictionary diverged: this and all later batches are counted in full.
+            self.shared = false;
+        }
+        self.total_size += batch_size;
+    }
+
+    /// Returns the estimated size in bytes of all buffered batches.
+    fn estimated_batches_size(&self) -> usize {
+        self.total_size
+    }
+
+    /// Consumes the buffer and returns the batches in insertion order.
+    fn into_batches(self) -> Vec<RecordBatch> {
+        self.batches
+    }
+}
+
+/// Forwards every batch of `stream` while buffering a clone, then caches the result under `key`.
+#[allow(dead_code)]
+pub(crate) fn cache_flat_range_stream(
+    mut stream: BoxedRecordBatchStream,
+    cache_strategy: CacheStrategy,
+    key: RangeScanCacheKey,
+    part_metrics: PartitionMetrics,
+) -> BoxedRecordBatchStream {
+    Box::pin(try_stream! {
+        let mut buffer = CacheBatchBuffer::new();
+        while let Some(batch) = stream.try_next().await? { // An error ends the stream here; nothing is cached.
+            buffer.push(batch.clone()); // Buffered without any size cap.
+            yield batch;
+        }
+        // Reached only once the inner stream has been fully consumed without error.
+        let estimated_size = buffer.estimated_batches_size();
+        let batches = buffer.into_batches();
+        let value = Arc::new(RangeScanCacheValue::new(batches, estimated_size));
+        part_metrics.inc_range_cache_size(key.estimated_size() + value.estimated_size());
+        cache_strategy.put_range_result(key, value);
+    })
+}
+
+/// Wraps `stream` with `cache_flat_range_stream` using placeholder internals for benchmarking.
+///
+/// A fresh cache manager, an empty fingerprint/key, and throwaway metrics are built here so that
+/// `RangeScanCacheKey`, `ScanRequestFingerprint`, and `PartitionMetrics` need not be public.
+#[cfg(feature = "test")]
+pub fn bench_cache_flat_range_stream(
+    stream: BoxedRecordBatchStream,
+    cache_size_bytes: u64,
+    region_id: RegionId,
+) -> BoxedRecordBatchStream {
+    use std::time::Instant;
+
+    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+
+    use crate::region::options::MergeMode;
+
+    let cache_manager = Arc::new(
+        crate::cache::CacheManager::builder()
+            .range_result_cache_size(cache_size_bytes)
+            .build(),
+    );
+    let cache_strategy = CacheStrategy::EnableAll(cache_manager);
+
+    let fingerprint = ScanRequestFingerprintBuilder {
+        read_column_ids: vec![],
+        read_column_types: vec![],
+        filters: vec![],
+        time_filters: vec![],
+        series_row_selector: None,
+        append_mode: false,
+        filter_deleted: false,
+        merge_mode: MergeMode::LastRow,
+        partition_expr_version: 0,
+    }
+    .build();
+
+    let key = RangeScanCacheKey {
+        region_id,
+        row_groups: vec![], // Placeholder key: no row groups are listed.
+        scan: fingerprint,
+    };
+
+    let metrics_set = ExecutionPlanMetricsSet::new();
+    let part_metrics =
+        PartitionMetrics::new(region_id, 0, "bench", Instant::now(), false, &metrics_set);
+
+    cache_flat_range_stream(stream, cache_strategy, key, part_metrics)
+}
+
 #[cfg(test)]
 mod tests {
-    use store_api::storage::TimeSeriesRowSelector;
+    use std::sync::Arc;
+    use std::time::Instant;
+
+    use common_time::Timestamp;
+    use common_time::range::TimestampRange;
+    use common_time::timestamp::TimeUnit;
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{Expr, col, lit};
+    use smallvec::smallvec;
+    use store_api::storage::FileId;

    use super::*;
+    use crate::cache::CacheManager;
+    use crate::read::projection::ProjectionMapper;
+    use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex};
+    use crate::read::scan_region::{PredicateGroup, ScanInput};
+    use crate::test_util::memtable_util::metadata_with_primary_key;
+    use crate::test_util::scheduler_util::SchedulerEnv;
+    use crate::test_util::sst_util::sst_file_handle_with_file_id;
+
+    /// Returns a cache strategy backed by a cache manager with a 1024-byte range result cache.
+    fn test_cache_strategy() -> CacheStrategy {
+        let manager = CacheManager::builder()
+            .range_result_cache_size(1024)
+            .build();
+        CacheStrategy::EnableAll(Arc::new(manager))
+    }
+
+    async fn new_stream_context(
+        filters: Vec<Expr>,
+        query_time_range: Option<TimestampRange>,
+        partition_time_range: FileTimeRange,
+    ) -> (StreamContext, PartitionRange) {
+        let env = SchedulerEnv::new().await;
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap();
+        let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap();
+        let file_id = FileId::random();
+        let file = sst_file_handle_with_file_id(
+            file_id,
+            partition_time_range.0.value(),
+            partition_time_range.1.value(),
+        );
+        let input = ScanInput::new(env.access_layer.clone(), mapper)
+            .with_predicate(predicate)
+            .with_time_range(query_time_range)
+            .with_files(vec![file])
+            .with_cache(test_cache_strategy())
+            .with_flat_format(true);
+        let range_meta = RangeMeta {
+            time_range: partition_time_range,
+            indices: smallvec![SourceIndex {
+                index: 0,
+                num_row_groups: 1,
+            }],
+            row_group_indices: smallvec![RowGroupIndex {
+                index: 0,
+                row_group_index: 0,
+            }],
+            num_rows: 10,
+        };
+        let partition_range = range_meta.new_partition_range(0);
+        let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input);
+        let stream_ctx = StreamContext {
+            input,
+            ranges: vec![range_meta],
+            scan_fingerprint,
+            query_start: Instant::now(),
+        };
+
+        (stream_ctx, partition_range)
+    }
+
+    /// Helper to create a timestamp millisecond literal.
+    fn ts_lit(val: i64) -> Expr {
+        lit(ScalarValue::TimestampMillisecond(Some(val), None))
+    }
+
+    #[tokio::test]
+    async fn strips_time_only_filters_when_query_covers_partition_range() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(1000)),
+                col("ts").lt(ts_lit(2001)),
+                col("ts").is_not_null(),
+                col("k0").eq(lit("foo")),
+            ],
+            TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond),
+            (
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Range-reducible time filters should be cleared when query covers partition range.
+        assert!(key.scan.time_filters().is_empty());
+        // Non-range time predicates stay in filters.
+        let mut expected_filters = [
+            col("k0").eq(lit("foo")).to_string(),
+            col("ts").is_not_null().to_string(),
+        ];
+        expected_filters.sort_unstable();
+        assert_eq!(key.scan.filters(), expected_filters.as_slice());
+    }
+
+    #[tokio::test]
+    async fn preserves_time_filters_when_query_does_not_cover_partition_range() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo"))],
+            TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond),
+            (
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Time filters should be preserved when query does not cover partition range.
+        assert_eq!(
+            key.scan.time_filters(),
+            [col("ts").gt_eq(ts_lit(1000)).to_string()].as_slice()
+        );
+        assert_eq!(
+            key.scan.filters(),
+            [col("k0").eq(lit("foo")).to_string()].as_slice()
+        );
+    }
+
+    #[tokio::test]
+    async fn strips_time_only_filters_when_query_has_no_time_range_limit() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(1000)),
+                col("ts").is_not_null(),
+                col("k0").eq(lit("foo")),
+            ],
+            None,
+            (
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Range-reducible time filters should be cleared when query has no time range limit.
+        assert!(key.scan.time_filters().is_empty());
+        // Non-range time predicates stay in filters.
+        let mut expected_filters = [
+            col("k0").eq(lit("foo")).to_string(),
+            col("ts").is_not_null().to_string(),
+        ];
+        expected_filters.sort_unstable();
+        assert_eq!(key.scan.filters(), expected_filters.as_slice());
+    }

    #[test]
    fn normalizes_and_clears_time_filters() {
@@ -249,4 +687,170 @@ mod tests {
            fingerprint.partition_expr_version
        );
    }
+
+    /// Creates a test schema with 5 columns where the primary key dictionary column
+    /// is at index 2 (`num_columns - 3`), matching the flat format layout.
+    ///
+    /// Layout: `[field0: Int64, field1: Int64, pk: Dictionary<UInt32,Binary>, ts: Int64, seq: Int64]`
+    fn dict_test_schema() -> Arc<datatypes::arrow::datatypes::Schema> {
+        use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+        Arc::new(Schema::new(vec![
+            Field::new("field0", ArrowDataType::Int64, false),
+            Field::new("field1", ArrowDataType::Int64, false),
+            Field::new(
+                "pk",
+                ArrowDataType::Dictionary(
+                    Box::new(ArrowDataType::UInt32),
+                    Box::new(ArrowDataType::Binary),
+                ),
+                false,
+            ),
+            Field::new("ts", ArrowDataType::Int64, false),
+            Field::new("seq", ArrowDataType::Int64, false),
+        ]))
+    }
+
+    /// Builds a five-column batch: `int_values` at index 1, a `keys`/`dict_values` dictionary at index 2, zeros elsewhere.
+    fn make_dict_batch(
+        schema: Arc<datatypes::arrow::datatypes::Schema>,
+        dict_values: &datatypes::arrow::array::BinaryArray,
+        keys: &[u32],
+        int_values: &[i64],
+    ) -> RecordBatch {
+        use datatypes::arrow::array::{Int64Array, UInt32Array};
+
+        let key_array = UInt32Array::from(keys.to_vec());
+        let dict_array: DictionaryArray<UInt32Type> =
+            DictionaryArray::new(key_array, Arc::new(dict_values.clone())); // Shared-dictionary tests rely on this clone being pointer-equal to `dict_values`.
+        let int_array = Int64Array::from(int_values.to_vec());
+        let zeros = Int64Array::from(vec![0i64; int_values.len()]); // Filler for the non-dictionary columns.
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(zeros.clone()),
+                Arc::new(int_array),
+                Arc::new(dict_array),
+                Arc::new(zeros.clone()),
+                Arc::new(zeros),
+            ],
+        )
+        .unwrap() // Panics if the columns do not form a valid batch for `schema`.
+    }
+
+    /// Sums `get_array_memory_size()` over the values arrays of all `UInt32`-keyed dictionary columns.
+    fn compute_total_dict_values_size(batch: &RecordBatch) -> usize {
+        let mut total = 0;
+        for col in batch.columns() {
+            // Columns that are not such dictionaries contribute nothing.
+            let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>() else {
+                continue;
+            };
+            total += dict.values().get_array_memory_size();
+        }
+        total
+    }
+
+    #[test]
+    fn cache_batch_buffer_empty() {
+        let buffer = CacheBatchBuffer::new();
+        assert_eq!(buffer.estimated_batches_size(), 0);
+        assert!(buffer.into_batches().is_empty());
+    }
+
+    #[test]
+    fn cache_batch_buffer_single_batch() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
+        let batch = make_dict_batch(schema, &dict_values, &[0, 1, 2], &[10, 20, 30]);
+
+        let full_size = batch.get_array_memory_size();
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch);
+        assert_eq!(buffer.estimated_batches_size(), full_size);
+        assert_eq!(buffer.into_batches().len(), 1);
+    }
+
+    #[test]
+    fn cache_batch_buffer_shared_dictionary() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values = BinaryArray::from_vec(vec![b"alpha", b"beta", b"gamma"]);
+
+        // Two batches sharing the same dictionary values array.
+        let batch1 = make_dict_batch(schema.clone(), &dict_values, &[0, 1], &[10, 20]);
+        let batch2 = make_dict_batch(schema, &dict_values, &[1, 2], &[30, 40]);
+
+        let batch1_full = batch1.get_array_memory_size();
+        let batch2_full = batch2.get_array_memory_size();
+
+        // The total dictionary values size that should be deduplicated for the second batch.
+        let dict_values_size = compute_total_dict_values_size(&batch2);
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+
+        // Second batch's dict values should not be counted again.
+        assert_eq!(
+            buffer.estimated_batches_size(),
+            batch1_full + batch2_full - dict_values_size
+        );
+        assert_eq!(buffer.into_batches().len(), 2);
+    }
+
+    #[test]
+    fn cache_batch_buffer_non_shared_dictionary() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values1 = BinaryArray::from_vec(vec![b"a", b"b"]);
+        let dict_values2 = BinaryArray::from_vec(vec![b"x", b"y"]);
+
+        let batch1 = make_dict_batch(schema.clone(), &dict_values1, &[0, 1], &[10, 20]);
+        let batch2 = make_dict_batch(schema, &dict_values2, &[0, 1], &[30, 40]);
+
+        let batch1_full = batch1.get_array_memory_size();
+        let batch2_full = batch2.get_array_memory_size();
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+
+        // Different dictionaries: full size for both.
+        assert_eq!(buffer.estimated_batches_size(), batch1_full + batch2_full);
+    }
+
+    #[test]
+    fn cache_batch_buffer_shared_then_diverged() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let shared_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
+        let different_values = BinaryArray::from_vec(vec![b"x", b"y"]);
+
+        let batch1 = make_dict_batch(schema.clone(), &shared_values, &[0], &[1]);
+        let batch2 = make_dict_batch(schema.clone(), &shared_values, &[1], &[2]);
+        let batch3 = make_dict_batch(schema, &different_values, &[0], &[3]);
+
+        let size1 = batch1.get_array_memory_size();
+        let size2 = batch2.get_array_memory_size();
+        let size3 = batch3.get_array_memory_size();
+
+        let dict_values_size = compute_total_dict_values_size(&batch2);
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+        buffer.push(batch3);
+
+        // batch2 shares dict with batch1 (dedup), batch3 does not (full size).
+        assert_eq!(
+            buffer.estimated_batches_size(),
+            size1 + (size2 - dict_values_size) + size3
+        );
+    }
 }
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -40,7 +40,7 @@ use store_api::region_engine::{PartitionRange, RegionScannerRef};
 use store_api::storage::{
    ColumnId, RegionId, ScanRequest, SequenceRange, TimeSeriesDistribution, TimeSeriesRowSelector,
 };
-use table::predicate::{Predicate, build_time_range_predicate};
+use table::predicate::{Predicate, build_time_range_predicate, extract_time_range_from_expr};
 use tokio::sync::{Semaphore, mpsc};
 use tokio_stream::wrappers::ReceiverStream;

@@ -1420,7 +1420,6 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {

 /// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible
 /// for partition range caching.
-#[cfg_attr(not(test), allow(dead_code))]
 pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFingerprint> {
    let eligible = input.flat_format
        && !input.compaction
@@ -1439,7 +1438,14 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
        .map(|col| col.column_schema.name.as_str())
        .collect();

-    let time_index_name = metadata.time_index_column().column_schema.name.clone();
+    let time_index = metadata.time_index_column();
+    let time_index_name = time_index.column_schema.name.clone();
+    let ts_col_unit = time_index
+        .column_schema
+        .data_type
+        .as_timestamp()
+        .expect("Time index must have timestamp-compatible type")
+        .unit();

    let exprs = input
        .predicate_group()
@@ -1464,9 +1470,16 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
            _ => false,
        };

-        if is_time_only {
+        if is_time_only
+            && extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some()
+        {
+            // Range-reducible time predicates can be safely dropped from the
+            // cache key when the query time range covers the partition range.
            time_filters.push(expr.to_string());
        } else {
+            // Non-time filters and non-range time predicates (those that
+            // extract_time_range_from_expr cannot convert to a TimestampRange)
+            // always stay in the cache key.
            filters.push(expr.to_string());
        }
    }
@@ -1511,6 +1524,10 @@ pub struct StreamContext {
    pub input: ScanInput,
    /// Metadata for partition ranges.
    pub(crate) ranges: Vec<RangeMeta>,
+    /// Precomputed scan fingerprint for partition range caching.
+    /// `None` when the scan is not eligible for caching.
+    #[allow(dead_code)]
+    pub(crate) scan_fingerprint: Option<ScanRequestFingerprint>,

    // Metrics:
    /// The start time of the query.
@@ -1523,10 +1540,12 @@ impl StreamContext {
        let query_start = input.query_start.unwrap_or_else(Instant::now);
        let ranges = RangeMeta::seq_scan_ranges(&input);
        READ_SST_COUNT.observe(input.num_files() as f64);
+        let scan_fingerprint = build_scan_fingerprint(&input);

        Self {
            input,
            ranges,
+            scan_fingerprint,
            query_start,
        }
    }
@@ -1536,10 +1555,12 @@ impl StreamContext {
        let query_start = input.query_start.unwrap_or_else(Instant::now);
        let ranges = RangeMeta::unordered_scan_ranges(&input);
        READ_SST_COUNT.observe(input.num_files() as f64);
+        let scan_fingerprint = build_scan_fingerprint(&input);

        Self {
            input,
            ranges,
+            scan_fingerprint,
            query_start,
        }
    }
@@ -1849,6 +1870,7 @@ mod tests {
    use std::sync::Arc;

    use datafusion::physical_plan::expressions::lit as physical_lit;
+    use datafusion_common::ScalarValue;
    use datafusion_expr::{col, lit};
    use datatypes::value::Value;
    use partition::expr::col as partition_col;
@@ -2035,13 +2057,18 @@ mod tests {
        assert!(scan_region.use_flat_format());
    }

+    /// Helper to create a timestamp millisecond literal.
+    fn ts_lit(val: i64) -> datafusion_expr::Expr {
+        lit(ScalarValue::TimestampMillisecond(Some(val), None))
+    }
+
    #[tokio::test]
    async fn test_build_scan_fingerprint_for_eligible_scan() {
        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
        let input = new_scan_input(
            metadata.clone(),
            vec![
-                col("ts").gt_eq(lit(1000)),
+                col("ts").gt_eq(ts_lit(1000)),
                col("k0").eq(lit("foo")),
                col("v0").gt(lit(1)),
            ],
@@ -2071,7 +2098,7 @@ mod tests {
                col("k0").eq(lit("foo")).to_string(),
                col("v0").gt(lit(1)).to_string(),
            ],
-            time_filters: vec![col("ts").gt_eq(lit(1000)).to_string()],
+            time_filters: vec![col("ts").gt_eq(ts_lit(1000)).to_string()],
            series_row_selector: Some(TimeSeriesRowSelector::LastRow),
            append_mode: false,
            filter_deleted: false,
--- a/src/mito2/src/read/scan_util.rs
+++ b/src/mito2/src/read/scan_util.rs
@@ -247,6 +247,12 @@ pub(crate) struct ScanMetricsSet {
    num_range_builders: isize,
    /// Peak number of file range builders.
    num_peak_range_builders: isize,
+    /// Total bytes added to the range cache during this scan.
+    range_cache_size: usize,
+    /// Number of range cache hits during this scan.
+    range_cache_hit: usize,
+    /// Number of range cache misses during this scan.
+    range_cache_miss: usize,
 }

 /// Wrapper for file metrics that compares by total cost in reverse order.
@@ -345,6 +351,9 @@ impl fmt::Debug for ScanMetricsSet {
            build_ranges_peak_mem_size,
            num_range_builders: _,
            num_peak_range_builders,
+            range_cache_size,
+            range_cache_hit,
+            range_cache_miss,
        } = self;

        // Write core metrics
@@ -590,6 +599,16 @@ impl fmt::Debug for ScanMetricsSet {
            write!(f, "}}")?;
        }

+        if *range_cache_size > 0 {
+            write!(f, ", \"range_cache_size\":{range_cache_size}")?;
+        }
+        if *range_cache_hit > 0 {
+            write!(f, ", \"range_cache_hit\":{range_cache_hit}")?;
+        }
+        if *range_cache_miss > 0 {
+            write!(f, ", \"range_cache_miss\":{range_cache_miss}")?;
+        }
+
        write!(
            f,
            ", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \
@@ -1097,6 +1116,27 @@ impl PartitionMetrics {
    pub(crate) fn dedup_metrics_reporter(&self) -> Arc<dyn DedupMetricsReport> {
        self.0.clone()
    }
+
+    /// Increments the total bytes added to the range cache.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_size(&self, size: usize) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_size += size;
+    }
+
+    /// Increments the range cache hit counter.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_hit(&self) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_hit += 1;
+    }
+
+    /// Increments the range cache miss counter.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_miss(&self) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_miss += 1;
+    }
 }

 impl fmt::Debug for PartitionMetrics {
--- a/src/mito2/src/read/stream.rs
+++ b/src/mito2/src/read/stream.rs
@@ -99,7 +99,8 @@ impl ConvertBatchStream {
                        let mapper = self.projection_mapper.as_flat().unwrap();

                        for batch in flat_batch.batches {
-                            self.pending.push_back(mapper.convert(&batch)?);
+                            self.pending
+                                .push_back(mapper.convert(&batch, &self.cache_strategy)?);
                        }
                    }
                }
@@ -114,7 +115,7 @@ impl ConvertBatchStream {
                // Safety: Only flat format returns this batch.
                let mapper = self.projection_mapper.as_flat().unwrap();

-                mapper.convert(&df_record_batch)
+                mapper.convert(&df_record_batch, &self.cache_strategy)
            }
        }
    }
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -29,6 +29,7 @@ pub mod flat_format;
 pub mod format;
 pub(crate) mod helper;
 pub(crate) mod metadata;
+pub mod prefilter;
 pub mod reader;
 pub mod row_group;
 pub mod row_selection;
--- a/src/mito2/src/sst/parquet/prefilter.rs
+++ b/src/mito2/src/sst/parquet/prefilter.rs
@@ -0,0 +1,528 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Helpers for parquet prefiltering.
+
+use std::ops::Range;
+
+use api::v1::SemanticType;
+use common_recordbatch::filter::SimpleFilterEvaluator;
+use datatypes::arrow::array::{BinaryArray, BooleanArray};
+use datatypes::arrow::record_batch::RecordBatch;
+use mito_codec::primary_key_filter::is_partition_column;
+use mito_codec::row_converter::PrimaryKeyFilter;
+use snafu::{OptionExt, ResultExt};
+use store_api::metadata::{RegionMetadata, RegionMetadataRef};
+
+use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu};
+use crate::sst::parquet::flat_format::primary_key_column_index;
+use crate::sst::parquet::format::PrimaryKeyArray;
+
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn matching_row_ranges_by_primary_key(
+    input: &RecordBatch,
+    pk_filter: &mut dyn PrimaryKeyFilter,
+) -> Result<Vec<Range<usize>>> {
+    let primary_key_index = primary_key_column_index(input.num_columns());
+    let pk_dict_array = input
+        .column(primary_key_index)
+        .as_any()
+        .downcast_ref::<PrimaryKeyArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key column is not a dictionary array",
+        })?;
+    let pk_values = pk_dict_array
+        .values()
+        .as_any()
+        .downcast_ref::<BinaryArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key values are not binary array",
+        })?;
+    let keys = pk_dict_array.keys();
+    let key_values = keys.values();
+
+    if key_values.is_empty() {
+        return Ok(std::iter::once(0..input.num_rows()).collect());
+    }
+
+    let mut matched_row_ranges: Vec<Range<usize>> = Vec::new();
+    let mut start = 0;
+    while start < key_values.len() {
+        let key = key_values[start];
+        let mut end = start + 1;
+        while end < key_values.len() && key_values[end] == key {
+            end += 1;
+        }
+
+        if pk_filter.matches(pk_values.value(key as usize)) {
+            if let Some(last) = matched_row_ranges.last_mut()
+                && last.end == start
+            {
+                last.end = end;
+            } else {
+                matched_row_ranges.push(start..end);
+            }
+        }
+
+        start = end;
+    }
+
+    Ok(matched_row_ranges)
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn prefilter_flat_batch_by_primary_key(
+    input: RecordBatch,
+    pk_filter: &mut dyn PrimaryKeyFilter,
+) -> Result<Option<RecordBatch>> {
+    if input.num_rows() == 0 {
+        return Ok(Some(input));
+    }
+
+    let matched_row_ranges = matching_row_ranges_by_primary_key(&input, pk_filter)?;
+    if matched_row_ranges.is_empty() {
+        return Ok(None);
+    }
+
+    if matched_row_ranges.len() == 1
+        && matched_row_ranges[0].start == 0
+        && matched_row_ranges[0].end == input.num_rows()
+    {
+        return Ok(Some(input));
+    }
+
+    if matched_row_ranges.len() == 1 {
+        let span = &matched_row_ranges[0];
+        return Ok(Some(input.slice(span.start, span.end - span.start)));
+    }
+
+    let mut mask = vec![false; input.num_rows()];
+    for span in matched_row_ranges {
+        mask[span].fill(true);
+    }
+
+    let filtered =
+        datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
+            .context(ComputeArrowSnafu)?;
+    if filtered.num_rows() == 0 {
+        Ok(None)
+    } else {
+        Ok(Some(filtered))
+    }
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn retain_usable_primary_key_filters(
+    sst_metadata: &RegionMetadataRef,
+    expected_metadata: Option<&RegionMetadata>,
+    filters: &mut Vec<SimpleFilterEvaluator>,
+) {
+    filters.retain(|filter| is_usable_primary_key_filter(sst_metadata, expected_metadata, filter));
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn is_usable_primary_key_filter(
+    sst_metadata: &RegionMetadataRef,
+    expected_metadata: Option<&RegionMetadata>,
+    filter: &SimpleFilterEvaluator,
+) -> bool {
+    // TODO(yingwen): The primary key filter always skips the partition column. Consider using a flag
+    // to control this behavior. We can remove this behavior after we remove the PartitionTreeMemtable.
+    if is_partition_column(filter.column_name()) {
+        return false;
+    }
+
+    let sst_column = match expected_metadata {
+        Some(expected_metadata) => {
+            let Some(expected_column) = expected_metadata.column_by_name(filter.column_name())
+            else {
+                return false;
+            };
+            let Some(sst_column) = sst_metadata.column_by_id(expected_column.column_id) else {
+                return false;
+            };
+
+            if sst_column.column_schema.name != expected_column.column_schema.name
+                || sst_column.semantic_type != expected_column.semantic_type
+                || sst_column.column_schema.data_type != expected_column.column_schema.data_type
+            {
+                return false;
+            }
+
+            sst_column
+        }
+        None => {
+            let Some(sst_column) = sst_metadata.column_by_name(filter.column_name()) else {
+                return false;
+            };
+            sst_column
+        }
+    };
+
+    sst_column.semantic_type == SemanticType::Tag
+        && sst_metadata
+            .primary_key_index(sst_column.column_id)
+            .is_some()
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) struct CachedPrimaryKeyFilter {
+    inner: Box<dyn PrimaryKeyFilter>,
+    last_primary_key: Vec<u8>,
+    last_match: Option<bool>,
+}
+
+impl CachedPrimaryKeyFilter {
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn new(inner: Box<dyn PrimaryKeyFilter>) -> Self {
+        Self {
+            inner,
+            last_primary_key: Vec::new(),
+            last_match: None,
+        }
+    }
+}
+
+impl PrimaryKeyFilter for CachedPrimaryKeyFilter {
+    fn matches(&mut self, pk: &[u8]) -> bool {
+        if let Some(last_match) = self.last_match
+            && self.last_primary_key == pk
+        {
+            return last_match;
+        }
+
+        let matched = self.inner.matches(pk);
+        self.last_primary_key.clear();
+        self.last_primary_key.extend_from_slice(pk);
+        self.last_match = Some(matched);
+        matched
+    }
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn batch_single_primary_key(batch: &RecordBatch) -> Result<Option<&[u8]>> {
+    let primary_key_index = primary_key_column_index(batch.num_columns());
+    let pk_dict_array = batch
+        .column(primary_key_index)
+        .as_any()
+        .downcast_ref::<PrimaryKeyArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key column is not a dictionary array",
+        })?;
+    let pk_values = pk_dict_array
+        .values()
+        .as_any()
+        .downcast_ref::<BinaryArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key values are not binary array",
+        })?;
+    let keys = pk_dict_array.keys();
+    if keys.is_empty() {
+        return Ok(None);
+    }
+
+    let first_key = keys.value(0);
+    if first_key != keys.value(keys.len() - 1) {
+        return Ok(None);
+    }
+
+    Ok(Some(pk_values.value(first_key as usize)))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use api::v1::SemanticType;
+    use common_recordbatch::filter::SimpleFilterEvaluator;
+    use datafusion_expr::{col, lit};
+    use datatypes::arrow::array::{
+        ArrayRef, BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
+        UInt64Array,
+    };
+    use datatypes::arrow::datatypes::{Schema, UInt32Type};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use datatypes::prelude::ConcreteDataType;
+    use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec};
+    use store_api::codec::PrimaryKeyEncoding;
+    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
+    use store_api::storage::ColumnSchema;
+
+    use super::*;
+    use crate::sst::internal_fields;
+    use crate::sst::parquet::format::ReadFormat;
+    use crate::test_util::sst_util::{
+        new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding,
+    };
+
+    fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec<SimpleFilterEvaluator> {
+        exprs
+            .iter()
+            .filter_map(SimpleFilterEvaluator::try_new)
+            .collect()
+    }
+
+    fn expected_metadata_with_reused_tag_name(
+        old_metadata: &RegionMetadata,
+    ) -> Arc<RegionMetadata> {
+        let mut builder = RegionMetadataBuilder::new(old_metadata.region_id);
+        builder
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "tag_0".to_string(),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: 10,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "tag_1".to_string(),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: 1,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "field_0".to_string(),
+                    ConcreteDataType::uint64_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Field,
+                column_id: 2,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "ts".to_string(),
+                    ConcreteDataType::timestamp_millisecond_datatype(),
+                    false,
+                ),
+                semantic_type: SemanticType::Timestamp,
+                column_id: 3,
+            })
+            .primary_key(vec![10, 1]);
+
+        Arc::new(builder.build().unwrap())
+    }
+
+    fn new_raw_batch_with_metadata(
+        metadata: Arc<RegionMetadata>,
+        primary_keys: &[&[u8]],
+        field_values: &[u64],
+    ) -> RecordBatch {
+        assert_eq!(primary_keys.len(), field_values.len());
+
+        let arrow_schema = metadata.schema.arrow_schema();
+        let field_column = arrow_schema
+            .field(arrow_schema.index_of("field_0").unwrap())
+            .clone();
+        let time_index_column = arrow_schema
+            .field(arrow_schema.index_of("ts").unwrap())
+            .clone();
+        let mut fields = vec![field_column, time_index_column];
+        fields.extend(
+            internal_fields()
+                .into_iter()
+                .map(|field| field.as_ref().clone()),
+        );
+        let schema = Arc::new(Schema::new(fields));
+
+        let mut dict_values = Vec::new();
+        let mut keys = Vec::with_capacity(primary_keys.len());
+        for pk in primary_keys {
+            let key = dict_values
+                .iter()
+                .position(|existing: &&[u8]| existing == pk)
+                .unwrap_or_else(|| {
+                    dict_values.push(*pk);
+                    dict_values.len() - 1
+                });
+            keys.push(key as u32);
+        }
+
+        let pk_array: ArrayRef = Arc::new(DictionaryArray::<UInt32Type>::new(
+            UInt32Array::from(keys),
+            Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())),
+        ));
+
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(UInt64Array::from(field_values.to_vec())),
+                Arc::new(TimestampMillisecondArray::from_iter_values(
+                    0..primary_keys.len() as i64,
+                )),
+                pk_array,
+                Arc::new(UInt64Array::from(vec![1; primary_keys.len()])),
+                Arc::new(UInt8Array::from(vec![1; primary_keys.len()])),
+            ],
+        )
+        .unwrap()
+    }
+
+    fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch {
+        new_raw_batch_with_metadata(Arc::new(sst_region_metadata()), primary_keys, field_values)
+    }
+
+    fn field_values(batch: &RecordBatch) -> Vec<u64> {
+        batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap()
+            .values()
+            .to_vec()
+    }
+
+    #[test]
+    fn test_retain_usable_primary_key_filters_skips_non_tag_filters() {
+        let metadata = Arc::new(sst_region_metadata());
+        let mut filters =
+            new_test_filters(&[col("field_0").eq(lit(1_u64)), col("ts").gt(lit(0_i64))]);
+
+        retain_usable_primary_key_filters(&metadata, None, &mut filters);
+
+        assert!(filters.is_empty());
+    }
+
+    #[test]
+    fn test_retain_usable_primary_key_filters_skips_reused_expected_tag_name() {
+        let metadata = Arc::new(sst_region_metadata());
+        let expected_metadata = expected_metadata_with_reused_tag_name(&metadata);
+        let mut filters = new_test_filters(&[col("tag_0").eq(lit("b"))]);
+
+        retain_usable_primary_key_filters(
+            &metadata,
+            Some(expected_metadata.as_ref()),
+            &mut filters,
+        );
+
+        assert!(filters.is_empty());
+    }
+
+    #[test]
+    fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() {
+        let metadata = Arc::new(sst_region_metadata_with_encoding(
+            PrimaryKeyEncoding::Sparse,
+        ));
+        let read_format = ReadFormat::new_flat(
+            metadata.clone(),
+            metadata.column_metadatas.iter().map(|c| c.column_id),
+            None,
+            "test",
+            true,
+        )
+        .unwrap();
+        assert!(read_format.as_flat().is_some());
+
+        let filter = SimpleFilterEvaluator::try_new(&col("tag_0").eq(lit("b"))).unwrap();
+        assert!(is_usable_primary_key_filter(&metadata, None, &filter));
+    }
+
+    #[test]
+    fn test_prefilter_primary_key_drops_single_dictionary_batch() {
+        let metadata = Arc::new(sst_region_metadata());
+        let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))]));
+        let mut primary_key_filter =
+            build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters);
+        let pk_a = new_primary_key(&["a", "x"]);
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]);
+
+        let filtered =
+            prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()).unwrap();
+
+        assert!(filtered.is_none());
+    }
+
+    #[test]
+    fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() {
+        let metadata = Arc::new(sst_region_metadata());
+        let filters = Arc::new(new_test_filters(&[col("tag_0")
+            .eq(lit("a"))
+            .or(col("tag_0").eq(lit("c")))]));
+        let mut primary_key_filter =
+            build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters);
+        let pk_a = new_primary_key(&["a", "x"]);
+        let pk_b = new_primary_key(&["b", "x"]);
+        let pk_c = new_primary_key(&["c", "x"]);
+        let pk_d = new_primary_key(&["d", "x"]);
+        let batch = new_raw_batch(
+            &[
+                pk_a.as_slice(),
+                pk_a.as_slice(),
+                pk_b.as_slice(),
+                pk_b.as_slice(),
+                pk_c.as_slice(),
+                pk_c.as_slice(),
+                pk_d.as_slice(),
+                pk_d.as_slice(),
+            ],
+            &[10, 11, 12, 13, 14, 15, 16, 17],
+        );
+
+        let filtered = prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut())
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(filtered.num_rows(), 4);
+        assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]);
+    }
+
+    struct CountingPrimaryKeyFilter {
+        hits: Arc<AtomicUsize>,
+        expected: Vec<u8>,
+    }
+
+    impl PrimaryKeyFilter for CountingPrimaryKeyFilter {
+        fn matches(&mut self, pk: &[u8]) -> bool {
+            self.hits.fetch_add(1, Ordering::Relaxed);
+            pk == self.expected.as_slice()
+        }
+    }
+
+    #[test]
+    fn test_cached_primary_key_filter_reuses_previous_result() {
+        let expected = new_primary_key(&["a", "x"]);
+        let hits = Arc::new(AtomicUsize::new(0));
+        let mut filter = CachedPrimaryKeyFilter::new(Box::new(CountingPrimaryKeyFilter {
+            hits: Arc::clone(&hits),
+            expected: expected.clone(),
+        }));
+
+        assert!(filter.matches(expected.as_slice()));
+        assert!(filter.matches(expected.as_slice()));
+        assert!(!filter.matches(new_primary_key(&["b", "x"]).as_slice()));
+
+        assert_eq!(hits.load(Ordering::Relaxed), 2);
+    }
+
+    #[test]
+    fn test_batch_single_primary_key() {
+        let pk_a = new_primary_key(&["a", "x"]);
+        let pk_b = new_primary_key(&["b", "x"]);
+
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]);
+        assert_eq!(
+            batch_single_primary_key(&batch).unwrap(),
+            Some(pk_a.as_slice())
+        );
+
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_b.as_slice()], &[10, 11]);
+        assert_eq!(batch_single_primary_key(&batch).unwrap(), None);
+    }
+}
--- a/src/mito2/src/test_util.rs
+++ b/src/mito2/src/test_util.rs
@@ -15,6 +15,7 @@
 //! Utilities for testing.

 pub mod batch_util;
+pub mod bench_util;
 pub mod memtable_util;
 pub mod scheduler_util;
 pub mod sst_util;
--- a/src/mito2/src/test_util/bench_util.rs
+++ b/src/mito2/src/test_util/bench_util.rs
@@ -0,0 +1,259 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared utilities for mito2 benchmarks.
+//!
+//! Provides a TSBS cpu-like data generator ([`CpuDataGenerator`]) and schema
+//! ([`cpu_metadata`]) shared by multiple mito2 benchmark binaries.
+
+use api::v1::value::ValueData;
+use api::v1::{Row, Rows, SemanticType};
+use datafusion_common::Column;
+use datafusion_expr::{Expr, lit};
+use datatypes::data_type::ConcreteDataType;
+use datatypes::schema::ColumnSchema;
+use rand::Rng;
+use rand::rngs::ThreadRng;
+use rand::seq::IndexedRandom;
+use store_api::metadata::{
+    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
+};
+use store_api::storage::RegionId;
+use table::predicate::Predicate;
+
+use crate::memtable::KeyValues;
+use crate::test_util::memtable_util::region_metadata_to_row_schema;
+
+pub struct Host {
+    pub hostname: String,
+    pub region: String,
+    pub datacenter: String,
+    pub rack: String,
+    pub os: String,
+    pub arch: String,
+    pub team: String,
+    pub service: String,
+    pub service_version: String,
+    pub service_environment: String,
+}
+
+impl Host {
+    pub fn random_with_id(id: usize) -> Host {
+        let mut rng = rand::rng();
+        let region = format!("ap-southeast-{}", rng.random_range(0..10));
+        let datacenter = format!(
+            "{}{}",
+            region,
+            ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
+        );
+        Host {
+            hostname: format!("host_{id}"),
+            region,
+            datacenter,
+            rack: rng.random_range(0..100).to_string(),
+            os: "Ubuntu16.04LTS".to_string(),
+            arch: "x86".to_string(),
+            team: "CHI".to_string(),
+            service: rng.random_range(0..100).to_string(),
+            service_version: rng.random_range(0..10).to_string(),
+            service_environment: "test".to_string(),
+        }
+    }
+
+    pub fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
+        let tags = [
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.hostname.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.region.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.datacenter.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.rack.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.os.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.arch.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.team.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.service.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.service_version.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.service_environment.clone())),
+            },
+        ];
+        for tag in tags {
+            values.push(tag);
+        }
+    }
+}
+
+pub struct CpuDataGenerator {
+    pub metadata: RegionMetadataRef,
+    column_schemas: Vec<api::v1::ColumnSchema>,
+    hosts: Vec<Host>,
+    start_sec: i64,
+    end_sec: i64,
+}
+
+impl CpuDataGenerator {
+    pub fn new(
+        metadata: RegionMetadataRef,
+        num_hosts: usize,
+        start_sec: i64,
+        end_sec: i64,
+    ) -> Self {
+        let column_schemas = region_metadata_to_row_schema(&metadata);
+        Self {
+            metadata,
+            column_schemas,
+            hosts: Self::generate_hosts(num_hosts),
+            start_sec,
+            end_sec,
+        }
+    }
+
+    pub fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
+        // One timestamp every 10 seconds (one `KeyValues` batch per step).
+        (self.start_sec..self.end_sec)
+            .step_by(10)
+            .enumerate()
+            .map(|(seq, ts)| self.build_key_values(seq, ts))
+    }
+
+    pub fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
+        let rows = self
+            .hosts
+            .iter()
+            .map(|host| {
+                let mut rng = rand::rng();
+                let mut values = Vec::with_capacity(21);
+                values.push(api::v1::Value {
+                    value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
+                });
+                host.fill_values(&mut values);
+                for _ in 0..10 {
+                    values.push(api::v1::Value {
+                        value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
+                    });
+                }
+                Row { values }
+            })
+            .collect();
+        let mutation = api::v1::Mutation {
+            op_type: api::v1::OpType::Put as i32,
+            sequence: seq as u64,
+            rows: Some(Rows {
+                schema: self.column_schemas.clone(),
+                rows,
+            }),
+            write_hint: None,
+        };
+
+        KeyValues::new(&self.metadata, mutation).unwrap()
+    }
+
+    pub fn random_host_filter(&self) -> Predicate {
+        let host = self.random_hostname();
+        let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
+        Predicate::new(vec![expr])
+    }
+
+    pub fn random_host_filter_exprs(&self) -> Vec<Expr> {
+        let host = self.random_hostname();
+        vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))]
+    }
+
+    pub fn random_hostname(&self) -> String {
+        let mut rng = rand::rng();
+        self.hosts.choose(&mut rng).unwrap().hostname.clone()
+    }
+
+    pub fn random_f64(rng: &mut ThreadRng) -> f64 {
+        let base: u32 = rng.random_range(30..95);
+        base as f64
+    }
+
+    pub fn generate_hosts(num_hosts: usize) -> Vec<Host> {
+        (0..num_hosts).map(Host::random_with_id).collect()
+    }
+}
+
+/// Creates the region metadata for a TSBS cpu-like table.
+pub fn cpu_metadata() -> RegionMetadata {
+    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
+    builder.push_column_metadata(ColumnMetadata {
+        column_schema: ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        ),
+        semantic_type: SemanticType::Timestamp,
+        column_id: 0,
+    });
+    let mut column_id = 1;
+    let tags = [
+        "hostname",
+        "region",
+        "datacenter",
+        "rack",
+        "os",
+        "arch",
+        "team",
+        "service",
+        "service_version",
+        "service_environment",
+    ];
+    for tag in tags {
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
+            semantic_type: SemanticType::Tag,
+            column_id,
+        });
+        column_id += 1;
+    }
+    let fields = [
+        "usage_user",
+        "usage_system",
+        "usage_idle",
+        "usage_nice",
+        "usage_iowait",
+        "usage_irq",
+        "usage_softirq",
+        "usage_steal",
+        "usage_guest",
+        "usage_guest_nice",
+    ];
+    for field in fields {
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
+            semantic_type: SemanticType::Field,
+            column_id,
+        });
+        column_id += 1;
+    }
+    builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+    builder.build().unwrap()
+}
--- a/src/mito2/src/test_util/memtable_util.rs
+++ b/src/mito2/src/test_util/memtable_util.rs
@@ -30,8 +30,7 @@ use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortFi
 use store_api::metadata::{
    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
 };
-use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange};
-use table::predicate::Predicate;
+use store_api::storage::{ColumnId, RegionId, SequenceNumber};

 use crate::error::Result;
 use crate::memtable::bulk::part::BulkPart;
--- a/src/mito2/src/worker.rs
+++ b/src/mito2/src/worker.rs
@@ -207,6 +207,7 @@ impl WorkerGroup {
                .vector_cache_size(config.vector_cache_size.as_bytes())
                .page_cache_size(config.page_cache_size.as_bytes())
                .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+                .range_result_cache_size(config.range_result_cache_size.as_bytes())
                .index_metadata_size(config.index.metadata_cache_size.as_bytes())
                .index_content_size(config.index.content_cache_size.as_bytes())
                .index_content_page_size(config.index.content_cache_page_size.as_bytes())
@@ -421,6 +422,7 @@ impl WorkerGroup {
                .vector_cache_size(config.vector_cache_size.as_bytes())
                .page_cache_size(config.page_cache_size.as_bytes())
                .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+                .range_result_cache_size(config.range_result_cache_size.as_bytes())
                .write_cache(write_cache)
                .build(),
        );
--- a/src/partition/src/cache.rs
+++ b/src/partition/src/cache.rs
@@ -121,10 +121,12 @@ pub fn new_partition_info_cache(
    CacheContainer::new(
        name,
        cache,
-        Box::new(|cache, ident| {
+        Box::new(|cache, idents| {
            Box::pin(async move {
-                if let CacheIdent::TableId(table_id) = ident {
-                    cache.invalidate(table_id).await
+                for ident in idents {
+                    if let CacheIdent::TableId(table_id) = ident {
+                        cache.invalidate(table_id).await
+                    }
                }
                Ok(())
            })
--- a/src/query/src/promql/planner.rs
+++ b/src/query/src/promql/planner.rs
@@ -3315,28 +3315,55 @@ impl PromPlanner {
    fn prom_token_to_binary_expr_builder(
        token: TokenType,
    ) -> Result<Box<dyn Fn(DfExpr, DfExpr) -> Result<DfExpr>>> {
+        let cast_float = |expr| {
+            if matches!(
+                &expr,
+                DfExpr::Cast(Cast {
+                    data_type: ArrowDataType::Float64,
+                    ..
+                })
+            ) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _))
+            {
+                expr
+            } else {
+                DfExpr::Cast(Cast {
+                    expr: Box::new(expr),
+                    data_type: ArrowDataType::Float64,
+                })
+            }
+        };
        match token.id() {
-            token::T_ADD => Ok(Box::new(|lhs, rhs| Ok(lhs + rhs))),
-            token::T_SUB => Ok(Box::new(|lhs, rhs| Ok(lhs - rhs))),
-            token::T_MUL => Ok(Box::new(|lhs, rhs| Ok(lhs * rhs))),
-            token::T_DIV => Ok(Box::new(|lhs, rhs| Ok(lhs / rhs))),
-            token::T_MOD => Ok(Box::new(|lhs: DfExpr, rhs| Ok(lhs % rhs))),
+            token::T_ADD => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) + cast_float(rhs))
+            })),
+            token::T_SUB => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) - cast_float(rhs))
+            })),
+            token::T_MUL => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) * cast_float(rhs))
+            })),
+            token::T_DIV => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) / cast_float(rhs))
+            })),
+            token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| {
+                Ok(cast_float(lhs) % cast_float(rhs))
+            })),
            token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))),
            token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))),
            token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))),
            token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))),
            token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))),
            token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))),
-            token::T_POW => Ok(Box::new(|lhs, rhs| {
+            token::T_POW => Ok(Box::new(move |lhs, rhs| {
                Ok(DfExpr::ScalarFunction(ScalarFunction {
                    func: datafusion_functions::math::power(),
-                    args: vec![lhs, rhs],
+                    args: vec![cast_float(lhs), cast_float(rhs)],
                }))
            })),
-            token::T_ATAN2 => Ok(Box::new(|lhs, rhs| {
+            token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| {
                Ok(DfExpr::ScalarFunction(ScalarFunction {
                    func: datafusion_functions::math::atan2(),
-                    args: vec![lhs, rhs],
+                    args: vec![cast_float(lhs), cast_float(rhs)],
                }))
            })),
            _ => UnexpectedTokenSnafu { token }.fail(),
@@ -5161,7 +5188,7 @@ mod test {
                .unwrap();

        let expected = String::from(
-            "Projection: rhs.tag_0, rhs.timestamp, lhs.field_0 + rhs.field_0 AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
+            "Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
            \n  Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n    SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5216,7 +5243,7 @@ mod test {
    async fn binary_op_literal_column() {
        let query = r#"1 + some_metric{tag_0="bar"}"#;
        let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + some_metric.field_0 AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5254,7 +5281,7 @@ mod test {
    async fn bool_with_additional_arithmetic() {
        let query = "some_metric + (1 == bool 2)";
        let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, some_metric.field_0 + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
            \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5364,7 +5391,7 @@ mod test {
            PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
                .await
                .unwrap();
-        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
+        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
            \n  Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\
            \n    SubqueryAlias: http_server_requests_seconds_sum\
            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
@@ -5755,7 +5782,7 @@ mod test {

        let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric";
        let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, greptime_private.some_alt_metric.field_0 / some_metric.field_0 AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
            \n  Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n    SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
            \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
--- a/src/servers/src/postgres/types.rs
+++ b/src/servers/src/postgres/types.rs
@@ -235,7 +235,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
    match origin {
        &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
        &ConcreteDataType::Boolean(_) => Ok(Type::BOOL),
-        &ConcreteDataType::Int8(_) => Ok(Type::CHAR),
+        &ConcreteDataType::Int8(_) => Ok(Type::INT2),
        &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2),
        &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4),
        &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8),
@@ -253,7 +253,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
        ConcreteDataType::List(list) => match list.item_type() {
            &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
            &ConcreteDataType::Boolean(_) => Ok(Type::BOOL_ARRAY),
-            &ConcreteDataType::Int8(_) => Ok(Type::CHAR_ARRAY),
+            &ConcreteDataType::Int8(_) => Ok(Type::INT2_ARRAY),
            &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2_ARRAY),
            &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4_ARRAY),
            &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8_ARRAY),
@@ -1151,7 +1151,7 @@ mod test {
        let pg_field_info = vec![
            FieldInfo::new("nulls".into(), None, None, Type::UNKNOWN, FieldFormat::Text),
            FieldInfo::new("bools".into(), None, None, Type::BOOL, FieldFormat::Text),
-            FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
+            FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
            FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
            FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
            FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),
@@ -1230,7 +1230,7 @@ mod test {
                Type::NUMERIC,
                FieldFormat::Text,
            ),
-            FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
+            FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
            FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
            FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
            FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),
--- a/src/servers/src/query_handler/grpc.rs
+++ b/src/servers/src/query_handler/grpc.rs
@@ -17,15 +17,13 @@ use std::sync::Arc;

 use api::v1::greptime_request::Request;
 use async_trait::async_trait;
-use common_base::AffectedRows;
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
 use futures::Stream;
 use session::context::QueryContextRef;
-use table::TableRef;

 use crate::error::Result;
-use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream};
+use crate::grpc::flight::PutRecordBatchRequestStream;

 pub type ServerGrpcQueryHandlerRef = Arc<dyn GrpcQueryHandler + Send + Sync>;

@@ -35,13 +33,6 @@ pub type RawRecordBatch = bytes::Bytes;
 pub trait GrpcQueryHandler {
    async fn do_query(&self, query: Request, ctx: QueryContextRef) -> Result<Output>;

-    async fn put_record_batch(
-        &self,
-        request: PutRecordBatchRequest,
-        table_ref: &mut Option<TableRef>,
-        ctx: QueryContextRef,
-    ) -> Result<AffectedRows>;
-
    fn handle_put_record_batch_stream(
        &self,
        stream: PutRecordBatchRequestStream,
--- a/src/servers/tests/mod.rs
+++ b/src/servers/tests/mod.rs
@@ -18,7 +18,6 @@ use api::v1::greptime_request::Request;
 use api::v1::query_request::Query;
 use async_trait::async_trait;
 use catalog::memory::MemoryCatalogManager;
-use common_base::AffectedRows;
 use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
@@ -149,15 +148,6 @@ impl GrpcQueryHandler for DummyInstance {
        Ok(output)
    }

-    async fn put_record_batch(
-        &self,
-        _request: servers::grpc::flight::PutRecordBatchRequest,
-        _table_ref: &mut Option<TableRef>,
-        _ctx: QueryContextRef,
-    ) -> Result<AffectedRows> {
-        unimplemented!()
-    }
-
    fn handle_put_record_batch_stream(
        &self,
        _stream: servers::grpc::flight::PutRecordBatchRequestStream,
--- a/src/table/src/predicate.rs
+++ b/src/table/src/predicate.rs
@@ -203,7 +203,7 @@ pub fn build_time_range_predicate(

 /// Extract time range filter from `WHERE`/`IN (...)`/`BETWEEN` clauses.
 /// Return None if no time range can be found in expr.
-fn extract_time_range_from_expr(
+pub fn extract_time_range_from_expr(
    ts_col_name: &str,
    ts_col_unit: TimeUnit,
    expr: &Expr,
--- a/src/table/src/requests.rs
+++ b/src/table/src/requests.rs
@@ -36,8 +36,9 @@ use store_api::metric_engine_consts::{
    LOGICAL_TABLE_METADATA_KEY, PHYSICAL_TABLE_METADATA_KEY, is_metric_engine_option_key,
 };
 use store_api::mito_engine_options::{
-    APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, TWCS_FALLBACK_TO_LOCAL,
-    TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, is_mito_engine_option_key,
+    APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, SST_FORMAT_KEY,
+    TWCS_FALLBACK_TO_LOCAL, TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM,
+    is_mito_engine_option_key,
 };
 use store_api::region_request::{SetRegionOption, UnsetRegionOption};

@@ -56,13 +57,14 @@ pub const TABLE_DATA_MODEL_TRACE_V1: &str = "greptime_trace_v1";
 pub const OTLP_METRIC_COMPAT_KEY: &str = "otlp_metric_compat";
 pub const OTLP_METRIC_COMPAT_PROM: &str = "prom";

-pub const VALID_TABLE_OPTION_KEYS: [&str; 12] = [
+pub const VALID_TABLE_OPTION_KEYS: [&str; 13] = [
    // common keys:
    WRITE_BUFFER_SIZE_KEY,
    TTL_KEY,
    STORAGE_KEY,
    COMMENT_KEY,
    SKIP_WAL_KEY,
+    SST_FORMAT_KEY,
    // file engine keys:
    FILE_TABLE_LOCATION_KEY,
    FILE_TABLE_FORMAT_KEY,
@@ -94,6 +96,7 @@ static VALID_DB_OPT_KEYS: Lazy<HashSet<&str>> = Lazy::new(|| {
    set.insert(TWCS_TIME_WINDOW);
    set.insert(TWCS_TRIGGER_FILE_NUM);
    set.insert(TWCS_MAX_OUTPUT_FILE_SIZE);
+    set.insert(SST_FORMAT_KEY);
    set
 });

--- a/tests-integration/src/tests/promql_test.rs
+++ b/tests-integration/src/tests/promql_test.rs
@@ -15,7 +15,9 @@
 use std::sync::Arc;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};

-use common_query::Output;
+use common_query::{Output, OutputData};
+use common_recordbatch::util::collect_batches;
+use datatypes::arrow::array::{Float64Array, Int64Array};
 use frontend::instance::Instance;
 use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
 use rstest::rstest;
@@ -151,6 +153,103 @@ async fn create_insert_tql_assert(
    check_unordered_output_stream(query_output, expected).await;
 }

+/// Submits `sql` through `do_query` and unwraps every result it returns.
+///
+/// The outputs are discarded; the call panics if any result is an error.
+async fn execute_all(instance: &Arc<Instance>, sql: &str, query_ctx: Arc<QueryContext>) {
+    let results = instance.do_query(sql, query_ctx).await;
+    for result in results {
+        let _ = result.unwrap();
+    }
+}
+
+/// Evaluates `promql` via `promql_query` and returns the result as record batches, panicking
+/// if the query fails or a streamed result cannot be collected.
+#[allow(clippy::too_many_arguments)]
+async fn promql_query_as_batches(
+    ins: Arc<Instance>,
+    promql: &str,
+    alias: Option<String>,
+    query_ctx: Arc<QueryContext>,
+    start: SystemTime,
+    end: SystemTime,
+    interval: Duration,
+    lookback: Duration,
+) -> common_recordbatch::RecordBatches {
+    let pending = promql_query(ins, promql, alias, query_ctx, start, end, interval, lookback);
+    let output = pending.await.unwrap();
+    // A stream is drained into batches; the test expects no other kind of output.
+    match output.data {
+        OutputData::RecordBatches(batches) => batches,
+        OutputData::Stream(stream) => collect_batches(stream).await.unwrap(),
+        _ => unreachable!(),
+    }
+}
+
+const ANON_PROMQL_RATIO_REPRO_DB: &str = "repro_db"; // also named in the `__schema__` matchers below
+/// DDL for the physical table `phy` and the logical metric tables `metric_a` and `metric_b`.
+const ANON_PROMQL_RATIO_REPRO_CREATE: &str = r#"
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+"#;
+/// Samples at t = 1, 180000 and 360000: three `metric_a` series and two `metric_b` series.
+const ANON_PROMQL_RATIO_REPRO_INSERT: &str = r#"
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+"#;
+/// Counts the series whose `rate(metric_a[3m]) / on(l3,l4) group_left metric_b` exceeds 0.50.
+const ANON_PROMQL_RATIO_REPRO_NUMERATOR: &str = r#"count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50))"#;
+/// Counts all `rate(metric_a[3m])` series.
+const ANON_PROMQL_RATIO_REPRO_DENOMINATOR: &str =
+    r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))"#;
+/// The full ratio: the numerator count divided by the denominator count, times 100.
+const ANON_PROMQL_RATIO_REPRO_WHOLE: &str = r#"(count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))) * 100"#;
+/// The denominator count divided by the scalar 2.
+const ANON_PROMQL_RATIO_REPRO_SCALAR_DIV: &str =
+    r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m])) / 2"#;
+
 #[apply(both_instances_cases)]
 async fn sql_insert_tql_query_ceil(instance: Arc<dyn MockInstance>) {
    let instance = instance.frontend();
@@ -709,3 +808,140 @@ async fn cross_schema_query(instance: Arc<dyn MockInstance>) {

    check_unordered_output_stream(query_output, expected).await;
 }
+
+#[apply(both_instances_cases)]
+async fn anon_promql_ratio_repro(instance: Arc<dyn MockInstance>) {
+    let ins = instance.frontend();
+    // Repro for a PromQL ratio of two counts; the assertions below expect float quotients.
+    execute_all(
+        &ins,
+        &format!("CREATE DATABASE {ANON_PROMQL_RATIO_REPRO_DB}"),
+        QueryContext::arc(),
+    )
+    .await;
+    // Create the tables and load the samples with the repro database as the current one.
+    let repro_ctx: Arc<QueryContext> =
+        QueryContext::with_db_name(Some(ANON_PROMQL_RATIO_REPRO_DB)).into();
+    execute_all(&ins, ANON_PROMQL_RATIO_REPRO_CREATE, repro_ctx.clone()).await;
+    execute_all(&ins, ANON_PROMQL_RATIO_REPRO_INSERT, repro_ctx).await;
+    // Evaluate over [180s, 360s] after the epoch with a 180s step and a 1s lookback.
+    let start = UNIX_EPOCH.checked_add(Duration::from_secs(180)).unwrap();
+    let end = UNIX_EPOCH.checked_add(Duration::from_secs(360)).unwrap();
+    let interval = Duration::from_secs(180);
+    let lookback = Duration::from_secs(1);
+    // Queries run under `QueryContext::arc()`; their `__schema__` matchers name the database.
+    let numerator = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_NUMERATOR,
+        Some("num".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let denominator = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_DENOMINATOR,
+        Some("den".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let whole = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_WHOLE,
+        Some("pct".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let scalar_div = promql_query_as_batches(
+        ins,
+        ANON_PROMQL_RATIO_REPRO_SCALAR_DIV,
+        Some("half_den".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    // Only the first record batch of each result is inspected below.
+    let numerator = numerator.iter().collect::<Vec<_>>();
+    let denominator = denominator.iter().collect::<Vec<_>>();
+    let whole = whole.iter().collect::<Vec<_>>();
+    let scalar_div = scalar_div.iter().collect::<Vec<_>>();
+    // The two counts are read as Int64 columns, the two quotients as Float64 columns.
+    let numerator_values = numerator[0]
+        .column_by_name("num")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .unwrap();
+    let denominator_values = denominator[0]
+        .column_by_name("den")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .unwrap();
+    let percentage_values = whole[0]
+        .column_by_name("pct")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .unwrap();
+    let scalar_div_values = scalar_div[0]
+        .column_by_name("half_den")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .unwrap();
+    // Each inspected column is expected to hold exactly one value.
+    assert_eq!(numerator_values.len(), 1, "{}", numerator[0].pretty_print());
+    assert_eq!(
+        denominator_values.len(),
+        1,
+        "{}",
+        denominator[0].pretty_print()
+    );
+    assert_eq!(percentage_values.len(), 1, "{}", whole[0].pretty_print());
+    assert_eq!(
+        scalar_div_values.len(),
+        1,
+        "{}",
+        scalar_div[0].pretty_print()
+    );
+    // Expected values: numerator 1, denominator 3, denominator / 2 = 1.5.
+    assert_eq!(
+        numerator_values.value(0),
+        1,
+        "{}",
+        numerator[0].pretty_print()
+    );
+    assert_eq!(
+        denominator_values.value(0),
+        3,
+        "{}",
+        denominator[0].pretty_print()
+    );
+    assert!(
+        (scalar_div_values.value(0) - 1.5).abs() < 1e-9,
+        "{}",
+        scalar_div[0].pretty_print()
+    );
+    // (1 / 3) * 100, compared with a small absolute tolerance.
+    let expected = 100.0 / 3.0;
+    assert!(
+        (percentage_values.value(0) - expected).abs() < 1e-9,
+        "{}",
+        whole[0].pretty_print()
+    );
+}
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -148,6 +148,7 @@ macro_rules! http_tests {
                test_jaeger_query_api_for_trace_v1,

                test_influxdb_write,
+                test_influxdb_write_with_hints,
                test_http_memory_limit,
            );
        )*
@@ -1641,6 +1642,7 @@ fn drop_lines_with_inconsistent_results(input: String) -> String {
        "metadata_cache_size =",
        "content_cache_size =",
        "result_cache_size =",
+        "range_result_cache_size =",
        "name =",
        "recovery_parallelism =",
        "max_background_index_builds =",
@@ -3638,6 +3640,43 @@ transform:
    guard.remove_all().await;
 }

+/// Checks that table options passed through the `x-greptime-hints` header are applied.
+///
+/// Writes one InfluxDB line into `sst_fmt_table` with the hints set, then asserts that
+/// `SHOW CREATE TABLE` reports each hinted option.
+pub async fn test_influxdb_write_with_hints(storage_type: StorageType) {
+    common_telemetry::init_default_ut_logging();
+    let (app, mut guard) =
+        setup_test_http_app_with_frontend(storage_type, "test_influxdb_write_with_hints").await;
+    let client = TestClient::new(app).await;
+
+    // The table options travel in the `x-greptime-hints` header of the write request.
+    let write_response = client
+        .post("/v1/influxdb/write?db=public")
+        .header("x-greptime-hints", "sst_format=flat,ttl=30d,skip_wal=true")
+        .body("sst_fmt_table,host=host1 cpu=1.2 1664370459457010101")
+        .send()
+        .await;
+    assert_eq!(write_response.status(), 204);
+
+    let show_response = client
+        .get("/v1/sql?sql=show create table sst_fmt_table")
+        .send()
+        .await;
+    assert_eq!(show_response.status(), StatusCode::OK);
+    let ddl = show_response.text().await;
+
+    // Each hinted option must appear in the DDL; the `30d` ttl is expected as `30days`.
+    for option in ["sst_format = 'flat'", "ttl = '30days'", "skip_wal = 'true'"] {
+        assert!(
+            ddl.contains(option),
+            "expected {option} in SHOW CREATE TABLE output, got: {ddl}"
+        );
+    }
+
+    guard.remove_all().await;
+}
+
 /// Test one-to-many VRL pipeline expansion.
 /// This test verifies that a VRL processor can return an array, which results in
 /// multiple output rows from a single input row.
--- a/tests/cases/distributed/explain/step_aggr_advance.result
+++ b/tests/cases/distributed/explain/step_aggr_advance.result
@@ -442,54 +442,54 @@ Affected Rows: 0
 -- SQLNESS REPLACE (Hash.*) REDACTED
 tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));

-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan  | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
-|               |   Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp                                                                                                                                                                                                                                                                                                                                                      |
-|               |     MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                          |
-|               |       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               | SubqueryAlias: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               |   Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-|               |     Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                       |
-|               |       Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-|               |         Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d                                                                                                                                                                                                                                                           |
-|               |           PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |             PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               |               PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-|               |                 Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                          |
-|               |                   Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                       |
-|               |                     TableScan: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-|               |     SubqueryAlias: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |       Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                     |
-|               |         Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                           |
-|               |           Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-|               |             Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c                                                                                                                                                                                                                                              |
-|               |               PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                          |
-|               |                 PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-|               |                   PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-|               |                     Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                        |
-|               |                       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                             |
-|               |   TableScan: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                               |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| logical_plan  | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, CAST(aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) / CAST(aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
+|               |   Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |     MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               | SubqueryAlias: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               |   Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+|               |     Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |       Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+|               |         Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d                                                                                                                                                                                                                                                                                             |
+|               |           PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |             PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               |               PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+|               |                 Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |                   Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |                     TableScan: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |     SubqueryAlias: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |       Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|               |         Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                             |
+|               |           Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |             Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c                                                                                                                                                                                                                                                                                |
+|               |               PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |                 PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+|               |                   PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+|               |                     Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                          |
+|               |                       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+|               |   TableScan: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                 |
 |               |   REDACTED
-|               |     CoalescePartitionsExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |       AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                                                                                                                                                                                                                                                                            |
-|               |         FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               |           ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c]                                                                                                                                                                                                                                                                                                                 |
-|               |             PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp]                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               |               PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-|               |                 PromSeriesDivideExec: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-|               |                   SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+|               |     CoalescePartitionsExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |       AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                                                                                                                                                                                                                                                                                                              |
+|               |         FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               |           ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c]                                                                                                                                                                                                                                                                                                                                                   |
+|               |             PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               |               PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+|               |                 PromSeriesDivideExec: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+|               |                   SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
 |               |                     MergeScanExec: REDACTED
-|               |     SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-|               |       CooperativeExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+|               |     SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |       CooperativeExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
 |               |         MergeScanExec: REDACTED
-|               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+|               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

 -- SQLNESS REPLACE (metrics.*) REDACTED
 -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
--- a/tests/cases/standalone/common/alter/alter_database.result
+++ b/tests/cases/standalone/common/alter/alter_database.result
@@ -314,6 +314,85 @@ SHOW CREATE DATABASE alter_database;
 |                | )                                            |
 +----------------+----------------------------------------------+

+-- Test sst_format option
+ALTER DATABASE alter_database SET 'sst_format'='flat';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
+----------------+----------------------------------------------+
+| Database       | Create Database                              |
+----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs',                |
+|                |   sst_format = 'flat'                        |
+|                | )                                            |
+----------------+----------------------------------------------+
+
+USE alter_database;
+
+Affected Rows: 0
+
+CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
+
+Affected Rows: 0
+
+SHOW CREATE TABLE monitor;
+
+---------+----------------------------------------+
+| Table   | Create Table                           |
+---------+----------------------------------------+
+| monitor | CREATE TABLE IF NOT EXISTS "monitor" ( |
+|         |   "ts" TIMESTAMP(3) NOT NULL,          |
+|         |   TIME INDEX ("ts")                    |
+|         | )                                      |
+|         |                                        |
+|         | ENGINE=mito                            |
+|         | WITH(                                  |
+|         |   sst_format = 'flat'                  |
+|         | )                                      |
+---------+----------------------------------------+
+
+USE public;
+
+Affected Rows: 0
+
+ALTER DATABASE alter_database SET 'sst_format'='primary_key';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
+----------------+----------------------------------------------+
+| Database       | Create Database                              |
+----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs',                |
+|                |   sst_format = 'primary_key'                 |
+|                | )                                            |
+----------------+----------------------------------------------+
+
+ALTER DATABASE alter_database UNSET 'sst_format';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
+----------------+----------------------------------------------+
+| Database       | Create Database                              |
+----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs'                 |
+|                | )                                            |
+----------------+----------------------------------------------+
+
 DROP DATABASE alter_database;

 Affected Rows: 0
--- a/tests/cases/standalone/common/alter/alter_database.sql
+++ b/tests/cases/standalone/common/alter/alter_database.sql
@@ -90,5 +90,25 @@ ALTER DATABASE alter_database UNSET 'ttl';

 SHOW CREATE DATABASE alter_database;

-DROP DATABASE alter_database;
+-- Test sst_format option
+ALTER DATABASE alter_database SET 'sst_format'='flat';

+SHOW CREATE DATABASE alter_database;
+
+USE alter_database;
+
+CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
+
+SHOW CREATE TABLE monitor;
+
+USE public;
+
+ALTER DATABASE alter_database SET 'sst_format'='primary_key';
+
+SHOW CREATE DATABASE alter_database;
+
+ALTER DATABASE alter_database UNSET 'sst_format';
+
+SHOW CREATE DATABASE alter_database;
+
+DROP DATABASE alter_database;
--- a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result
+++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result
@@ -0,0 +1,106 @@
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+Affected Rows: 0
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+Affected Rows: 0
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+Affected Rows: 0
+
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+Affected Rows: 9
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+
+Affected Rows: 6
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
+
+---------------------+-------------------------------------------------------------------+
+| t                   | count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) |
+---------------------+-------------------------------------------------------------------+
+| 1970-01-01T00:03:00 | 1                                                                 |
+---------------------+-------------------------------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
+
+---------------------+---------------------------------------------+
+| t                   | count(prom_rate(t_range,v,t,Int64(180000))) |
+---------------------+---------------------------------------------+
+| 1970-01-01T00:03:00 | 3                                           |
+---------------------+---------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
+
+---------------------+----------------------------------------------------------+
+| t                   | count(prom_rate(t_range,v,t,Int64(180000))) / Float64(2) |
+---------------------+----------------------------------------------------------+
+| 1970-01-01T00:03:00 | 1.5                                                      |
+---------------------+----------------------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
+
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+| t                   | metric_b.count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) / metric_a.count(prom_rate(t_range,v,t,Int64(180000))) * Float64(100) |
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+| 1970-01-01T00:03:00 | 33.33333333333333                                                                                                                                |
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+
+DROP TABLE metric_a;
+
+Affected Rows: 0
+
+DROP TABLE metric_b;
+
+Affected Rows: 0
+
+DROP TABLE phy;
+
+Affected Rows: 0
+
--- a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql
+++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql
@@ -0,0 +1,63 @@
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
+
+DROP TABLE metric_a;
+DROP TABLE metric_b;
+DROP TABLE phy;
--- a/tests/cases/standalone/common/tql/tql-cte.result
+++ b/tests/cases/standalone/common/tql/tql-cte.result
@@ -427,8 +427,8 @@ SELECT min(val) as min_computed, max(val) as max_computed FROM computed;
 |               |   Aggregate: groupBy=[[]], aggr=[[min(computed.val), max(computed.val)]]                                                    |
 |               |     SubqueryAlias: computed                                                                                                 |
 |               |       Projection: metric.ts AS ts, val * Float64(2) + Float64(1) AS val                                                     |
-|               |         Projection: metric.ts, val * Float64(2) + Float64(1) AS val * Float64(2) + Float64(1)                               |
-|               |           Projection: metric.ts, metric.val * Float64(2) AS val * Float64(2)                                                |
+|               |         Projection: metric.ts, CAST(val * Float64(2) AS Float64) + Float64(1) AS val * Float64(2) + Float64(1)              |
+|               |           Projection: metric.ts, CAST(metric.val AS Float64) * Float64(2) AS val * Float64(2)                               |
 |               |             PromInstantManipulate: range=[0..40000], lookback=[300000], interval=[10000], time index=[ts]                   |
 |               |               PromSeriesDivide: tags=[]                                                                                     |
 |               |                 Filter: metric.ts >= TimestampMillisecond(-299999, None) AND metric.ts <= TimestampMillisecond(40000, None) |